[SPARK-13306] [SQL] uncorrelated scalar subquery #11190

Closed · wants to merge 9 commits
@@ -205,6 +205,8 @@ atomExpression
| whenExpression
| (functionName LPAREN) => function
| tableOrColumn
| (LPAREN KW_SELECT) => subQueryExpression
-> ^(TOK_SUBQUERY_EXPR ^(TOK_SUBQUERY_OP) subQueryExpression)
| LPAREN! expression RPAREN!
;

@@ -667,6 +667,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
UnresolvedAttribute(nameParts :+ cleanIdentifier(attr))
case other => UnresolvedExtractValue(other, Literal(cleanIdentifier(attr)))
}
case Token("TOK_SUBQUERY_EXPR", Token("TOK_SUBQUERY_OP", Nil) :: subquery :: Nil) =>
ScalarSubquery(nodeToPlan(subquery))
Contributor:

This might sound exceedingly dumb, but I cannot find ScalarSubquery or SubqueryExpression. Are they already in the code base, or did you create this branch on top of another branch?

Contributor:

Never mind, I just found the other PR...

Contributor Author:

I missed a file, sorry


/* Stars (*) */
case Token("TOK_ALLCOLREF", Nil) => UnresolvedStar(None)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class Analyzer(
ResolveGenerate ::
ResolveFunctions ::
ResolveAliases ::
ResolveSubquery ::
ResolveWindowOrder ::
ResolveWindowFrame ::
ResolveNaturalJoin ::
@@ -120,7 +121,13 @@
withAlias.getOrElse(relation)
}
substituted.getOrElse(u)
case other =>
Contributor:

Add a quick comment on why this isn't in ResolveSubquery.

Contributor Author:

done

other transformExpressions {
case e: SubqueryExpression =>
e.withNewPlan(substituteCTE(e.query, cteRelations))
}
}

}
}

@@ -693,6 +700,28 @@ class Analyzer(
}
}

/**
* This rule resolves subqueries inside expressions.
Contributor:

indent.

maybe comment that CTEs are handled elsewhere.

*/
object ResolveSubquery extends Rule[LogicalPlan] with PredicateHelper {

private def hasSubquery(e: Expression): Boolean = {
e.find(_.isInstanceOf[SubqueryExpression]).isDefined
}

private def hasSubquery(q: LogicalPlan): Boolean = {
q.expressions.exists(hasSubquery)
}

def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
case q: LogicalPlan if q.childrenResolved && hasSubquery(q) =>
q transformExpressions {
case e: SubqueryExpression if !e.query.resolved =>
e.withNewPlan(execute(e.query))
}
}
}

/**
* Turns projections that contain aggregate expressions into aggregations.
*/
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.DataType

/**
* An interface for subqueries that are used in expressions.
Contributor:

indent

*/
trait SubqueryExpression extends LeafExpression {
Contributor:

abstract class?

def query: LogicalPlan
Contributor:

Why is this needed?

Contributor Author:

This is a helper function used in the Analyzer and Optimizer; otherwise we would need to do type conversions.

Contributor Author:

This is the base class for both the logical plan and the physical plan, which is kind of weird. It is there to make generateTreeString work in QueryPlan.

Contributor:

The Analyzer and Optimizer only apply to logical plans, right?

Contributor Author:

yes

def withNewPlan(plan: LogicalPlan): SubqueryExpression
Contributor:

scaladoc

Contributor:

Can't this just be in the logical plan itself?

Contributor Author:

This should be copy(), but I did not figure out how to make copy() work for different kinds of SubqueryExpression.

Contributor:

I think you can just remove this and move it into the logical subquery expression, since it's only used for logical plans anyway?

Contributor Author:

Then should we have LogicalSubqueryExpression?

Contributor:

I meant ScalarSubquery. That's already the one, isn't it?

Contributor Author:

We will have ExistsSubquery, InSubquery shortly (or in the next release).
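For context, a minimal sketch of how keeping withNewPlan on the shared trait would let such future subquery expressions plug into the same ResolveSubquery rule. The class ExistsSubquery and its members are hypothetical and not part of this PR; only SubqueryExpression and the imports come from this diff.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.{BooleanType, DataType}

// Hypothetical sketch only; EXISTS/IN subqueries are not added in this PR.
// Anything extending SubqueryExpression can be rewritten by ResolveSubquery
// through withNewPlan, regardless of the concrete expression shape.
case class ExistsSubquery(query: LogicalPlan)
  extends SubqueryExpression with CodegenFallback {

  override lazy val resolved: Boolean = query.resolved
  override def dataType: DataType = BooleanType
  override def nullable: Boolean = false
  override def foldable: Boolean = false

  override def withNewPlan(plan: LogicalPlan): ExistsSubquery = ExistsSubquery(plan)

  // An EXISTS predicate would be rewritten (e.g. into a semi join) before
  // execution, so direct evaluation is unsupported in this sketch.
  override def eval(input: InternalRow): Any =
    throw new UnsupportedOperationException("ExistsSubquery must be rewritten before evaluation")
}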

}

/**
* A subquery that will return only one row and one column.
Contributor:

indent

*/
case class ScalarSubquery(query: LogicalPlan) extends SubqueryExpression with CodegenFallback {

override lazy val resolved: Boolean = query.resolved

override def dataType: DataType = query.schema.fields.head.dataType

override def checkInputDataTypes(): TypeCheckResult = {
if (query.schema.length != 1) {
TypeCheckResult.TypeCheckFailure("Scalar subquery can only have 1 column, but got " +
Contributor:

Needs tests, probably in AnalysisErrorSuite.

Contributor Author:

done

Contributor:

"Scalar subquery must return only one column, but got " ...

(Postgres's wording)

query.schema.length.toString)
} else {
TypeCheckResult.TypeCheckSuccess
}
}

// It cannot be evaluated by the optimizer.
override def foldable: Boolean = false
override def nullable: Boolean = true

override def withNewPlan(plan: LogicalPlan): ScalarSubquery = ScalarSubquery(plan)

// TODO: support sql()

// The first column of the first row from `query`.
private var result: Any = null

def updateResult(v: Any): Unit = {
Contributor:

scaladoc

result = v
}

override def eval(input: InternalRow): Any = result
}
@@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.unsafe.types.CalendarInterval

class CatalystQlSuite extends PlanTest {
@@ -201,4 +202,49 @@ class CatalystQlSuite extends PlanTest {
parser.parsePlan("select sum(product + 1) over (partition by (product + (1)) order by 2) " +
"from windowData")
}

test("subquery") {
comparePlans(
parser.parsePlan("select (select max(b) from s) ss from t"),
Project(
Contributor Author:

@hvanhovell I'm going to remove this plan checking; it's very easy to break. The details of the plan don't mean much, and we will have other tests to verify correctness.

UnresolvedAlias(
Alias(
ScalarSubquery(
Project(
UnresolvedAlias(
UnresolvedFunction("max", UnresolvedAttribute("b") :: Nil, false)) :: Nil,
UnresolvedRelation(TableIdentifier("s")))),
"ss")(ExprId(0))) :: Nil,
UnresolvedRelation(TableIdentifier("t"))))
comparePlans(
parser.parsePlan("select * from t where a = (select b from s)"),
Project(
UnresolvedAlias(
UnresolvedStar(None)) :: Nil,
Filter(
EqualTo(
UnresolvedAttribute("a"),
ScalarSubquery(
Project(
UnresolvedAlias(
UnresolvedAttribute("b")) :: Nil,
UnresolvedRelation(TableIdentifier("s"))))),
UnresolvedRelation(TableIdentifier("t")))))
comparePlans(
parser.parsePlan("select * from t group by g having a > (select b from s)"),
Filter(
Cast(
GreaterThan(
UnresolvedAttribute("a"),
ScalarSubquery(
Project(
UnresolvedAlias(
UnresolvedAttribute("b")) :: Nil,
UnresolvedRelation(TableIdentifier("s"))))),
BooleanType),
Aggregate(
UnresolvedAttribute("g") :: Nil,
UnresolvedAlias(UnresolvedStar(None)) :: Nil,
UnresolvedRelation(TableIdentifier("t")))))
}
}
@@ -20,17 +20,20 @@ package org.apache.spark.sql.execution
import java.util.concurrent.atomic.AtomicBoolean

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration._

import org.apache.spark.Logging
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetric}
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.ThreadUtils

/**
* The base class for physical operators.
@@ -122,7 +125,42 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
final def prepare(): Unit = {
if (prepareCalled.compareAndSet(false, true)) {
doPrepare()

// collect all the subqueries and submit jobs to execute them in background
val queryResults = ArrayBuffer[(ScalarSubquery, Future[Array[InternalRow]])]()
val allSubqueries = expressions.flatMap(_.collect {case e: ScalarSubquery => e})
allSubqueries.foreach { e =>
val futureResult = scala.concurrent.future {
val df = DataFrame(sqlContext, e.query)
df.queryExecution.toRdd.collect()
}(SparkPlan.subqueryExecutionContext)
queryResults += e -> futureResult
}

children.foreach(_.prepare())

val timeout: Duration = {
val timeoutValue = sqlContext.conf.broadcastTimeout
if (timeoutValue < 0) {
Duration.Inf
} else {
timeoutValue.seconds
}
}

// fill in the result of subqueries
queryResults.foreach {
Contributor:

We should move the blocking phase into execute(); otherwise, if multiple nodes have subqueries, this becomes blocking.

Contributor:

Ah, ok, you can't have a general execute().

I guess this is why some query engines have init and then prepare.

Contributor:

Or a subquery is now blocking broadcasting ...

Contributor Author:

This is called after doPrepare(), and after prepare() of its children, so it will NOT block broadcasting (they happen at the same time).

Contributor:

What if there is a broadcast join after this?

Contributor Author:

The broadcast will be issued before this.

case (e, futureResult) =>
val rows = Await.result(futureResult, timeout)
Contributor:

This timeout is kind of weird, right? The max timeout here could be numberOfSubqueries * timeout?

Contributor Author:

All the subqueries are submitted at the same time (at the beginning of prepare()), so the total time should be bounded by a single timeout.

Should we create another config for subqueries, or rename the broadcast one?

if (rows.length > 1) {
sys.error(s"Scalar subquery should return at most one row, but got ${rows.length}: " +
Contributor:

We can use Postgres's error message: "more than one row returned by a subquery used as an expression"

Contributor Author:

I had never thought of matching PostgreSQL's error message exactly; that's a great idea.

Contributor Author:

The current error message has more information than Postgres's; should we change it?

Contributor:

Not 100% sure. Maybe it's better to just say "more than one", so we don't need to run the whole plan (e.g. I'm thinking we could inject a limit of 2 into the subquery).

Contributor Author:

Good idea, changed to call executeTake(2).
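A minimal sketch of what the executeTake(2)-based evaluation could look like, assuming the names from the surrounding diff (`e` is a collected ScalarSubquery, `sqlContext` the plan's SQLContext); this is an illustration of the suggestion above, not the exact code that was committed:

// Sketch only. executeTake(2) fetches at most two rows, which is enough to
// detect "more than one row" without materializing the whole subquery result.
private def evalScalarSubquery(sqlContext: SQLContext, e: ScalarSubquery): Unit = {
  val rows = DataFrame(sqlContext, e.query).queryExecution.executedPlan.executeTake(2)
  if (rows.length > 1) {
    sys.error("more than one row returned by a subquery used as an expression:\n" +
      e.query.treeString)
  } else if (rows.length == 1) {
    e.updateResult(rows(0).get(0, e.dataType))
  }
  // zero rows: leave the result as null
}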

s"${e.query.treeString}")
}
// Analyzer will make sure that it only returns one column
Contributor:

"Analyzer should make sure this only returns one column"

and add an assert after this.

if (rows.length > 0) {
Contributor:

Can rows.length ever be 0 here? If it can only be 1, why are we testing > 0 here?

Contributor Author:

The length could be zero; in that case the value is null.

Contributor:

How do we write a query with 0 columns? The comment above said the analyzer would make sure there's only one column.

If it is possible to have 0 columns, then I'd make it explicit here that the value is set to null.

Contributor:

Also, if it is possible to have 0 columns, we need to add a test case.

Contributor Author:

rows.length means the number of rows, not the number of columns.

Contributor:

Ok, that makes sense.

Please change the check to rows.length == 1.

It's pretty confusing to first check that it's greater than 1, and then check that it is greater than 0, when you are just expecting 1.

Contributor:

The same thing applies here: the test coverage for this is pretty bad. Add a test case where the subquery returns 0 rows or more than 1 row.

Contributor Author:

Repeated: rows.length could be 0, and then the value will be null; I will add a comment for that.

Contributor:

Yeah, it'd be better to make it more explicit, e.g.

if (rows.length == 0) {
  e.updateResult(null)
} else {
  assert(rows.length == 1)
  e.updateResult(rows(0).get(0, e.dataType))
}

e.updateResult(rows(0).get(0, e.dataType))
}
}
}
}

@@ -231,6 +269,11 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
}
}

object SparkPlan {
private[execution] val subqueryExecutionContext = ExecutionContext.fromExecutorService(
Contributor:

What thread pool are broadcasts done on? Should it be the same one?

Contributor Author:

This could be refactored later to use the same thread pool for all of them.

Contributor:

BroadcastHashJoin defines a ThreadPool for broadcasting. I am moving that into exchange.scala as part of #11083. We could use that one.

ThreadUtils.newDaemonCachedThreadPool("subquery", 16))
}

private[sql] trait LeafNode extends SparkPlan {
override def children: Seq[SparkPlan] = Nil
override def producedAttributes: AttributeSet = outputSet
16 changes: 16 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2105,6 +2105,22 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
assert(error.getMessage contains "grouping_id() can only be used with GroupingSets/Cube/Rollup")
}

test("uncorrelated scalar subquery") {
Contributor:

About test coverage:

Let's create a subquery suite and move the test cases there.

Also, it would be great to have at least one test case that actually runs on a dataset that is not generated by just "select x", because I worry that in the future we will add some special optimizations and then all the test cases here become no-ops.

Contributor:

Also, we should test the behavior when no rows are returned.
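As an illustration, a hedged sketch of what such a zero-row test could look like; the null-on-empty behavior and the query shape are assumptions based on the discussion above, not code from this PR:

test("uncorrelated scalar subquery returning no rows yields null") {
  // The inner query matches no rows, so the scalar subquery should
  // evaluate to null per the semantics discussed above.
  assertResult(Array(Row(null))) {
    sql("select (select a from (select 1 as a) t where a > 1) as b").collect()
  }
}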

assertResult(Array(Row(1))) {
sql("select (select 1 as b) as b").collect()
}

assertResult(Array(Row(1))) {
sql("with t2 as (select 1 as b, 2 as c) " +
"select a from (select 1 as a union all select 2 as a) t " +
"where a = (select max(b) from t2) ").collect()
Contributor:

If we support nested subqueries, can we add a test case?

Contributor Author:

Added

}

assertResult(Array(Row(3))) {
sql("select (select (select 1) + 1) + 1").collect()
}
}

test("SPARK-13056: Null in map value causes NPE") {
val df = Seq(1 -> Map("abc" -> "somestring", "cba" -> null)).toDF("key", "value")
withTempTable("maptest") {