[WIP] [skip ci] Fuzz testing in Spark SQL #7625

Closed · wants to merge 87 commits
Commits (87)
6f2b909
Fix SPARK-9292.
JoshRosen Jul 24, 2015
03120d5
Check condition type in resolved()
JoshRosen Jul 24, 2015
e1f462e
Initial commit for SQL expression fuzzing harness
JoshRosen Jul 11, 2015
f8daec7
Apply implicit casts (in a hacky way for now)
JoshRosen Jul 11, 2015
df00e7a
More messy WIP prototyping on expression fuzzing
JoshRosen Jul 11, 2015
2dcbc10
Add some comments; speed up classpath search
JoshRosen Jul 11, 2015
c20a679
Move dummy type coercion to a helper method
JoshRosen Jul 11, 2015
95860de
More code cleanup and comments
JoshRosen Jul 11, 2015
abaed51
Use non-mutable interpreted projection.
JoshRosen Jul 11, 2015
129ad6c
Log expression after coercion
JoshRosen Jul 11, 2015
e1f91df
Run tests in deterministic order
JoshRosen Jul 13, 2015
adc3c7f
Test with random inputs of all types
JoshRosen Jul 23, 2015
ae5e151
Ignore BinaryType for now, since it led to some spurious failures.
JoshRosen Jul 23, 2015
a354208
Begin to add a DataFrame API fuzzer.
JoshRosen Jul 24, 2015
13f8c56
Don't puts nulls into the DataFrame
JoshRosen Jul 24, 2015
dd16f4d
Print logical plans.
JoshRosen Jul 24, 2015
7f2b771
Fuzzer improvements.
JoshRosen Jul 24, 2015
326d759
Fix SPARK-9293
JoshRosen Jul 24, 2015
4a2c684
Merge branch 'SPARK-9293' into fuzz-test
JoshRosen Jul 24, 2015
37e4ce8
Support methods that take varargs Column parameters.
JoshRosen Jul 24, 2015
558f04a
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 24, 2015
2f1b802
Add analysis rule to detect sorting on unsupported column types (SPAR…
JoshRosen Jul 24, 2015
c0889c0
Merge branch 'SPARK-9295' into fuzz-test
JoshRosen Jul 24, 2015
d7a3535
[SPARK-9303] Decimal should use java.math.Decimal directly instead of…
JoshRosen Jul 24, 2015
74bbc8c
Merge branch 'SPARK-9303' into fuzz-test
JoshRosen Jul 24, 2015
bfe1451
Update to allow sorting by null literals
JoshRosen Jul 24, 2015
7a7ec4d
Merge branch 'SPARK-9295' into fuzz-test
JoshRosen Jul 24, 2015
55221fa
Shouldn't use SortMergeJoin when joining on unsortable columns.
viirya Jul 24, 2015
a240707
Use forall instead of exists for readability.
viirya Jul 24, 2015
dc94314
Merge remote-tracking branch 'origin/pr/7645/head' into fuzz-test
JoshRosen Jul 24, 2015
16dfac9
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 24, 2015
a3f30bf
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 24, 2015
68c0e97
Commit some outstanding changes.
JoshRosen Jul 24, 2015
2d4ed76
Move to fuzzing package.
JoshRosen Jul 24, 2015
ac8dd74
Begin to clean up random DF generator
JoshRosen Jul 25, 2015
0b3938b
Add basic backtracking to improve chance of generating executable plan.
JoshRosen Jul 26, 2015
c836884
Hacky approach to try to execute child plans first.
JoshRosen Jul 26, 2015
fb1a666
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 26, 2015
396c235
Enable Unsafe by default
JoshRosen Jul 21, 2015
71fe0bb
Ignore failing ScalaUDFSuite test.
JoshRosen Jul 21, 2015
04fbe3e
Do not use UnsafeExternalSort operator if codegen is disabled
JoshRosen Jul 21, 2015
a7979dc
Disable unsafe Exchange path when RangePartitioning is used
JoshRosen Jul 21, 2015
4fcae4a
Reduce page size to make HiveCompatibilitySuite pass.
JoshRosen Jul 21, 2015
7e49d0e
Fix use-after-free bug in UnsafeExternalSorter.
JoshRosen Jul 26, 2015
0b95b71
Merge branch 'unsafe-by-default' into fuzz-test
JoshRosen Jul 26, 2015
454c921
Hack to enable join types to be tested
JoshRosen Jul 26, 2015
11f80a3
[SPARK-9368][SQL] Support get(ordinal, dataType) generic getter in Un…
rxin Jul 27, 2015
9989064
JoinedRow.
rxin Jul 27, 2015
24a3e46
Added support for DateType/TimestampType.
rxin Jul 27, 2015
fb6ca30
Support BinaryType.
rxin Jul 27, 2015
0f57c55
Reset the changes in ExpressionEvalHelper.
rxin Jul 27, 2015
3063788
Reset the change for real this time.
rxin Jul 27, 2015
5e3e266
Merge remote-tracking branch 'origin/pr/7682/head' into fuzz-test
JoshRosen Jul 27, 2015
6214682
Fixes to null handling in UnsafeRow
JoshRosen Jul 28, 2015
4c09a78
Enable Unsafe by default
JoshRosen Jul 21, 2015
54579b1
Disable unsafe Exchange path when RangePartitioning is used
JoshRosen Jul 21, 2015
601fcbd
Reduce page size to make HiveCompatibilitySuite pass.
JoshRosen Jul 21, 2015
e5f7464
Add task completion callback to avoid leak in limit after sort
JoshRosen Jul 27, 2015
c8eb2ee
Fix test in UnsafeRowConverterSuite
JoshRosen Jul 28, 2015
ef1c62d
Also match TungstenProject in checkNumProjects
JoshRosen Jul 28, 2015
203f1d8
Use TaskAttemptIds to track unroll memory
JoshRosen Jul 28, 2015
d7a2788
Use TaskAttemptIds to track shuffle memory
JoshRosen Jul 28, 2015
b38e70f
Roll back fix in PySpark, which is no longer necessary
JoshRosen Jul 28, 2015
d8bd892
Fix capitalization
JoshRosen Jul 28, 2015
56edb41
Move Executor's cleanup into Task so that TaskContext is defined when…
JoshRosen Jul 28, 2015
f4f5859
More thread -> task changes
JoshRosen Jul 28, 2015
e2b69c9
Fix ShuffleMemoryManagerSuite
JoshRosen Jul 28, 2015
63492c4
Fix long line.
JoshRosen Jul 28, 2015
6dc34f4
Merge branch 'unsafe-row-null-fixes' into unsafe-by-default
JoshRosen Jul 28, 2015
6ac2d82
Merge branch 'unsafe-by-default' into fuzz-test
JoshRosen Jul 29, 2015
18615a6
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 30, 2015
704abc1
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Aug 14, 2015
b549b3e
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Aug 16, 2015
ca8168a
Fix compilation with latest master.
JoshRosen Aug 16, 2015
0c7e9d0
Update to ignore some new analysis exceptions.
JoshRosen Aug 16, 2015
fb0671f
Move RandomDataFrameGenerator to own file.
JoshRosen Aug 16, 2015
78a71af
WIP
JoshRosen Aug 17, 2015
3b06849
Filter failing BinaryType array test.
JoshRosen Aug 17, 2015
574130b
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Aug 23, 2015
a4c9b33
WIP
JoshRosen Aug 24, 2015
d36f8f5
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen May 27, 2016
ae5055a
Also ignore BRound expression.
JoshRosen May 27, 2016
dfdab5e
Fix serializability.
JoshRosen May 27, 2016
bb4cc2a
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Jul 29, 2016
e60f231
More input type validation.
JoshRosen Jul 29, 2016
94087cb
Updates for DataSet API.
JoshRosen Jul 29, 2016
d1d3d53
Merge remote-tracking branch 'origin/master' into fuzz-test
JoshRosen Aug 16, 2016
6 changes: 6 additions & 0 deletions pom.xml
@@ -699,6 +699,12 @@
<artifactId>scalap</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.scalaz</groupId>
<artifactId>scalaz-core_2.10</artifactId>
<version>7.1.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
@@ -245,6 +245,12 @@ trait CheckAnalysis extends PredicateHelper {
aggregateExprs.foreach(checkValidAggregateExpression)
groupingExprs.foreach(checkValidGroupingExprs)

case s @ SetOperation(left, right) if left.output.length != right.output.length =>
failAnalysis(
s"${s.nodeName} can only be performed on tables with the same number of columns, " +
s"but the left table has ${left.output.length} columns and the right has " +
s"${right.output.length}")

case Sort(orders, _, _) =>
orders.foreach { order =>
if (!RowOrdering.isOrderable(order.dataType)) {
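
For context, here is a minimal sketch (not part of the patch) of a query that the new SetOperation arity check above rejects, assuming a running SparkSession named `spark`:

// Union of relations with different column counts now fails analysis up front.
val left = spark.range(10).selectExpr("id", "id * 2 AS doubled")  // two columns
val right = spark.range(10).selectExpr("id")                      // one column
left.union(right)
// org.apache.spark.sql.AnalysisException: Union can only be performed on tables with the
// same number of columns, but the left table has 2 columns and the right has 1
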
@@ -209,7 +209,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
}

private[this] def decimalToTimestamp(d: Decimal): Long = {
(d.toBigDecimal * 1000000L).longValue()
d.toJavaBigDecimal.multiply(java.math.BigDecimal.valueOf(1000000L)).longValue()
}
private[this] def doubleToTimestamp(d: Double): Any = {
if (d.isNaN || d.isInfinite) null else (d * 1000000L).toLong
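
The rewritten `decimalToTimestamp` keeps the whole conversion in `java.math.BigDecimal` instead of detouring through Scala's BigDecimal. As a rough illustration (not from the patch), the multiply-by-1,000,000 step turns a decimal number of seconds into microseconds:

// The same arithmetic in isolation, using only java.math.BigDecimal:
val d = new java.math.BigDecimal("1.5")
d.multiply(java.math.BigDecimal.valueOf(1000000L)).longValue()  // 1500000L (microseconds)
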
@@ -98,13 +98,13 @@ trait ExtractValue extends Expression
/**
* Returns the value of fields in the Struct `child`.
*
* No need to do type checking since it is handled by [[ExtractValue]].
*
* Note that we can pass in the field name directly to keep case preserving in `toString`.
* For example, when getting the field `yEAr` from `<year: int, month: int>`, we should pass in `yEAr`.
*/
case class GetStructField(child: Expression, ordinal: Int, name: Option[String] = None)
extends UnaryExpression with ExtractValue {
extends UnaryExpression with ExtractValue with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(StructType)

lazy val childSchema = child.dataType.asInstanceOf[StructType]

@@ -144,16 +144,15 @@ case class GetStructField(child: Expression, ordinal: Int, name: Option[String]
/**
* For a child whose data type is an array of structs, extracts the `ordinal`-th fields of all array
* elements, and returns them as a new array.
*
* No need to do type checking since it is handled by [[ExtractValue]].
*/
case class GetArrayStructFields(
child: Expression,
field: StructField,
ordinal: Int,
numFields: Int,
containsNull: Boolean) extends UnaryExpression with ExtractValue {
containsNull: Boolean) extends UnaryExpression with ExtractValue with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType)
override def dataType: DataType = ArrayType(field.dataType, containsNull)
override def toString: String = s"$child.${field.name}"
override def sql: String = s"${child.sql}.${quoteIdentifier(field.name)}"
@@ -215,8 +214,7 @@ case class GetArrayStructFields(
case class GetArrayItem(child: Expression, ordinal: Expression)
extends BinaryExpression with ExpectsInputTypes with ExtractValue {

// We have done type checking for child in `ExtractValue`, so only need to check the `ordinal`.
override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, IntegralType)
override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType, IntegralType)

override def toString: String = s"$child[$ordinal]"
override def sql: String = s"${child.sql}[${ordinal.sql}]"
@@ -264,8 +262,7 @@ case class GetMapValue(child: Expression, key: Expression)

private def keyType = child.dataType.asInstanceOf[MapType].keyType

// We have done type checking for child in `ExtractValue`, so only need to check the `key`.
override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, keyType)
override def inputTypes: Seq[AbstractDataType] = Seq(MapType, keyType)

override def toString: String = s"$child[$key]"
override def sql: String = s"${child.sql}[${key.sql}]"
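
Declaring concrete input types here matters mainly when these extractors are constructed directly (as the expression fuzzer does) rather than through `ExtractValue`, which used to be the only place the child's type was checked. A rough sketch of the effect (illustrative, not from the patch):

// A directly-constructed GetArrayItem with a non-array child now fails its input type check
// during analysis instead of blowing up at execution time:
GetArrayItem(Literal(1), Literal(0)).checkInputDataTypes()
// => TypeCheckFailure (the first argument requires array type, but the child is an int)
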
@@ -23,11 +23,11 @@ import org.apache.spark.sql.types._

/**
* Return the unscaled Long value of a Decimal, assuming it fits in a Long.
* Note: this expression is internal and created only by the optimizer,
* we don't need to do type check for it.
* Note: this expression is internal and created only by the optimizer.
*/
case class UnscaledValue(child: Expression) extends UnaryExpression {
case class UnscaledValue(child: Expression) extends UnaryExpression with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType)
override def dataType: DataType = LongType
override def toString: String = s"UnscaledValue($child)"

@@ -41,11 +41,15 @@ case class UnscaledValue(child: Expression) extends UnaryExpression

/**
* Create a Decimal from an unscaled Long value.
* Note: this expression is internal and created only by the optimizer,
* we don't need to do type check for it.
* Note: this expression is internal and created only by the optimizer.
*/
case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends UnaryExpression {
case class MakeDecimal(
child: Expression,
precision: Int,
scale: Int)
extends UnaryExpression with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(LongType)
override def dataType: DataType = DecimalType(precision, scale)
override def nullable: Boolean = true
override def toString: String = s"MakeDecimal($child,$precision,$scale)"
@@ -80,7 +84,12 @@ case class PromotePrecision(child: Expression) extends UnaryExpression {
* Rounds the decimal to given scale and check whether the decimal can fit in provided precision
* or not, returns null if not.
*/
case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression {
case class CheckOverflow(
child: Expression,
dataType: DecimalType)
extends UnaryExpression with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(DecimalType)

override def nullable: Boolean = true

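
For orientation, an illustrative sketch using Catalyst's internal helpers (not part of the patch): `UnscaledValue` and `MakeDecimal` are inverses on the unscaled representation, which is why `DecimalType` and `LongType` are the natural declared input types.

UnscaledValue(Literal(Decimal(BigDecimal("1.23")))).eval()   // 123L (precision 3, scale 2)
MakeDecimal(Literal(123L), precision = 3, scale = 2).eval()  // Decimal 1.23
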
@@ -21,6 +21,15 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.Attribute

object JoinType {

val supportedJoinTypes = Seq(
"inner",
"outer", "full", "fullouter",
"leftouter", "left",
"rightouter", "right",
"leftsemi",
"leftanti")

def apply(typ: String): JoinType = typ.toLowerCase.replace("_", "") match {
case "inner" => Inner
case "outer" | "full" | "fullouter" => FullOuter
@@ -29,16 +38,8 @@ object JoinType {
case "leftsemi" => LeftSemi
case "leftanti" => LeftAnti
case _ =>
val supported = Seq(
"inner",
"outer", "full", "fullouter",
"leftouter", "left",
"rightouter", "right",
"leftsemi",
"leftanti")

throw new IllegalArgumentException(s"Unsupported join type '$typ'. " +
"Supported join types include: " + supported.mkString("'", "', '", "'") + ".")
"Supported join types include: " + supportedJoinTypes.mkString("'", "', '", "'") + ".")
}
}

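
Factoring the list into `supportedJoinTypes` only changes where the names live; the error path behaves as before. A quick illustration (not from the patch):

JoinType("sideways")
// java.lang.IllegalArgumentException: Unsupported join type 'sideways'. Supported join types
// include: 'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left', 'rightouter', 'right',
// 'leftsemi', 'leftanti'.
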
12 changes: 12 additions & 0 deletions sql/core/pom.xml
@@ -127,6 +127,18 @@
<artifactId>xbean-asm5-shaded</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.clapper</groupId>
<artifactId>classutil_${scala.binary.version}</artifactId>
<version>1.0.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalaz</groupId>
<artifactId>scalaz-core_${scala.binary.version}</artifactId>
<version>7.2.3</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
3 changes: 2 additions & 1 deletion sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -150,7 +150,8 @@ class UDFSuite extends QueryTest with SharedSQLContext {
assert(result.count() === 2)
}

test("UDFs everywhere") {
// Temporarily ignored until we implement code generation for ScalaUDF.
ignore("UDFs everywhere") {
spark.udf.register("groupFunction", (n: Int) => { n > 10 })
spark.udf.register("havingFilter", (n: Long) => { n > 2000 })
spark.udf.register("whereFilter", (n: Int) => { n < 150 })
@@ -0,0 +1,192 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.fuzzing

import scala.util.Random
import scala.util.control.NonFatal

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils

object DataFrameFuzzingUtils {

def randomChoice[T](values: Seq[T]): T = {
values(Random.nextInt(values.length))
}

/**
* Build a list of column names and types for the given StructType, taking nesting into account.
* For nested struct fields, this will emit both the column for the struct field itself and
* fields for the nested struct's fields. This process will be performed recursively in order to
* handle deeply-nested structs.
*/
def getColumnsAndTypes(struct: StructType): Seq[(String, DataType)] = {
struct.flatMap { field =>
val nestedFieldInfos: Seq[(String, DataType)] = field.dataType match {
case nestedStruct: StructType =>
Seq((field.name, field.dataType)) ++ getColumnsAndTypes(nestedStruct).map {
case (nestedColName, dataType) => (field.name + "." + nestedColName, dataType)
}
case _ => Seq.empty
}
Seq((field.name, field.dataType)) ++ nestedFieldInfos
}
}
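
// Illustrative note (added for readability; not in the original patch): for a schema like
//   new StructType().add("a", IntegerType).add("s", new StructType().add("x", StringType))
// this yields entries such as ("a", IntegerType), ("s", <struct>) and ("s.x", StringType),
// so callers can refer either to whole struct columns or to nested fields by dotted name.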

def getRandomColumnName(
df: DataFrame,
condition: DataType => Boolean = _ => true): Option[String] = {
val columnsWithTypes = getColumnsAndTypes(df.schema)
val candidateColumns = columnsWithTypes.filter(c => condition(c._2))
if (candidateColumns.isEmpty) {
None
} else {
Some(randomChoice(candidateColumns)._1)
}
}
}


/**
* This test suite generates random data frames, then applies random sequences of operations to
* them in order to construct random queries. We don't have a source of truth for these random
* queries, but they are still useful for testing that we don't crash in bad ways.
*/
class DataFrameFuzzingSuite extends QueryTest with SharedSparkContext {


override protected def spark: SparkSession = sqlContext.sparkSession

val tempDir = Utils.createTempDir()

private var sqlContext: SQLContext = _
private var dataGenerator: RandomDataFrameGenerator = _

override def beforeAll(): Unit = {
super.beforeAll()
sqlContext = new SQLContext(sc)
dataGenerator = new RandomDataFrameGenerator(123, sqlContext)
sqlContext.conf.setConf(SQLConf.SHUFFLE_PARTITIONS, 10)
}

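// Runs the DataFrame to completion (via rdd.count()); if execution fails, prints the
// QueryExecution so the offending plan can be inspected, then rethrows.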
def tryToExecute(df: DataFrame): DataFrame = {
try {
df.rdd.count()
df
} catch {
case NonFatal(e) =>
// scalastyle:off println
println(df.queryExecution)
// scalastyle:on println
throw e
}
}

// TODO: make these regexes.
val ignoredAnalysisExceptionMessages = Seq(
// TODO: filter only for binary type:
"cannot sort data type array<",
"cannot be used in grouping expression",
"cannot be used in join condition",
"can only be performed on tables with the same number of columns",
"number of columns doesn't match",
"unsupported join type",
"is neither present in the group by, nor is it an aggregate function",
"is ambiguous, could be:",
"unresolved operator 'Project", // TODO
"unresolved operator 'Union", // TODO: disabled to let me find new errors
"unresolved operator 'Except", // TODO: disabled to let me find new errors
"unresolved operator 'Intersect", // TODO: disabled to let me find new errors
"Cannot resolve column name" // TODO: only ignore for join?
)

def getRandomTransformation(df: DataFrame): DataFrameTransformation = {
(1 to 1000).iterator.map(_ => ReflectiveFuzzing.getTransformation(df)).flatten.next()
}

def applyRandomTransform(df: DataFrame): DataFrame = {
val tf = getRandomTransformation(df)
// scalastyle:off println
println(" " + tf)
// scalastyle:on println
tf.apply(df)
}

def resetConfs(): Unit = {
sqlContext.conf.getAllDefinedConfs.foreach { case (key, defaultValue, doc) =>
sqlContext.conf.setConfString(key, defaultValue)
}
sqlContext.conf.setConfString("spark.sql.crossJoin.enabled", "true")
sqlContext.conf.setConfString("spark.sql.autoBroadcastJoinThreshold", "-1")
}

private val configurations = Seq(
"default" -> Seq(),
"no optimization" -> Seq(SQLConf.OPTIMIZER_MAX_ITERATIONS.key -> "0"),
"disable-wholestage-codegen" -> Seq(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false"),
"disable-exchange-reuse" -> Seq(SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false")
)

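// Wraps the same logical plan in a fresh Dataset so that analysis and physical planning are
// redone under whatever SQL configuration is currently set.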
def replan(df: DataFrame): DataFrame = {
new Dataset[Row](sqlContext.sparkSession, df.logicalPlan, RowEncoder(df.schema))
}

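// Overall strategy: generate a small random DataFrame, apply a few random transformations
// (executing after each step so failures surface early), record the result under the default
// configuration, then re-plan and re-run the same logical plan under each alternative
// configuration and check that the answers match.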
test("fuzz test") {
for (i <- 1 to 1000) {
// scalastyle:off println
println(s"Iteration $i")
// scalastyle:on println
try {
resetConfs()
var df = dataGenerator.randomDataFrame(
numCols = Random.nextInt(2) + 1,
numRows = 20,
allowComplexTypes = false)
var depth = 3
while (depth > 0) {
df = tryToExecute(applyRandomTransform(df))
depth -= 1
}
val defaultResult = replan(df).collect()
configurations.foreach { case (confName, confsToSet) =>
resetConfs()
withClue(s"configuration = $confName") {
confsToSet.foreach { case (key, value) =>
sqlContext.conf.setConfString(key, value)
}
checkAnswer(replan(df), defaultResult)
}
}
println(s"Finished all tests successfully for plan:\n${df.logicalPlan}")
} catch {
case e: UnresolvedException[_] =>
// println("skipped due to unresolved")
case e: Exception
if ignoredAnalysisExceptionMessages.exists {
m => Option(e.getMessage).getOrElse("").toLowerCase.contains(m.toLowerCase)
} =>
// println("Skipped due to expected AnalysisException " + e)
}
}
}
}