From 6e2f53238856942e40a3301108f37a3a5cc17bca Mon Sep 17 00:00:00 2001
From: gatorsmile <gatorsmile@gmail.com>
Date: Thu, 5 Oct 2017 11:20:48 -0700
Subject: [PATCH 1/3] Keep whole-stage codegen for batch file scans over the huge-method limit

---
 .../sql/execution/WholeStageCodegenExec.scala | 12 ++++++----
 .../execution/WholeStageCodegenSuite.scala    | 22 +++++++++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
index 9073d599ac43d..30ef2a5e31665 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
@@ -392,12 +392,16 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
 
     // Check if compiled code has a too large function
     if (maxCodeSize > sqlContext.conf.hugeMethodLimit) {
-      logWarning(s"Found too long generated codes and JIT optimization might not work: " +
-        s"the bytecode size was $maxCodeSize, this value went over the limit " +
+      logInfo(s"Found too long generated codes and JIT optimization might not work: " +
+        s"the bytecode size ($maxCodeSize) is above the limit " +
         s"${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was disabled " +
         s"for this plan. To avoid this, you can raise the limit " +
-        s"${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}:\n$treeString")
-      return child.execute()
+        s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString")
+      child match {
+        // For batch file source scan, we should continue executing it
+        case f: FileSourceScanExec if f.supportsBatch => // do nothing
+        case _ => child.execute()
+      }
     }
 
     val references = ctx.references.toArray
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
index aaa77b3ee6201..00bec337af63f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{QueryTest, Row, SaveMode}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator}
 import org.apache.spark.sql.execution.aggregate.HashAggregateExec
 import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
@@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
 
-class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {
+class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
 
   test("range/filter should be combined") {
     val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
@@ -185,4 +185,22 @@ class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {
     val (_, maxCodeSize2) = CodeGenerator.compile(codeWithLongFunctions)
     assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get)
   }
+
+  test("returning batch for wide table") {
+    import testImplicits._
+    withTempPath { dir =>
+      val path = dir.getCanonicalPath
+      val df = spark.range(10).select(Seq.tabulate(201) {i => ('id + i).as(s"c$i")} : _*)
+      df.write.mode(SaveMode.Overwrite).parquet(path)
+
+      withSQLConf(SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "202",
+        SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "8000") {
+        // donot return batch, because whole stage codegen is disabled for wide table (>202 columns)
+        val df2 = spark.read.parquet(path)
+        val fileScan2 = df2.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get
+        assert(fileScan2.asInstanceOf[FileSourceScanExec].supportsBatch)
+        checkAnswer(df2, df)
+      }
+    }
+  }
 }

From 473bbf0057f50ca14bd3fe23a433d68e7b9c1c48 Mon Sep 17 00:00:00 2001
From: gatorsmile <gatorsmile@gmail.com>
Date: Thu, 5 Oct 2017 11:29:25 -0700
Subject: [PATCH 2/3] Return child.execute() in the non-batch huge-method fallback

---
 .../org/apache/spark/sql/execution/WholeStageCodegenExec.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
index 30ef2a5e31665..4b9c5afb2cf3f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
@@ -400,7 +400,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
       child match {
         // For batch file source scan, we should continue executing it
         case f: FileSourceScanExec if f.supportsBatch => // do nothing
-        case _ => child.execute()
+        case _ => return child.execute()
       }
     }
 

From b8eb6a0e45ceb9592fbbf32a236aa17cd3e5dac0 Mon Sep 17 00:00:00 2001
From: gatorsmile <gatorsmile@gmail.com>
Date: Thu, 5 Oct 2017 20:50:29 -0700
Subject: [PATCH 3/3] Clarify batch-scan fallback comment; rename test and lower its huge-method limit

---
 .../apache/spark/sql/execution/WholeStageCodegenExec.scala | 2 +-
 .../spark/sql/execution/WholeStageCodegenSuite.scala       | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
index 4b9c5afb2cf3f..1aaaf896692d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
@@ -398,7 +398,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
         s"for this plan. To avoid this, you can raise the limit " +
         s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString")
       child match {
-        // For batch file source scan, we should continue executing it
+        // The fallback solution of batch file source scan still uses WholeStageCodegenExec
         case f: FileSourceScanExec if f.supportsBatch => // do nothing
         case _ => return child.execute()
       }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
index 00bec337af63f..098e4cfeb15b2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -186,7 +186,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
     assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get)
   }
 
-  test("returning batch for wide table") {
+  test("bytecode of batch file scan exceeds the limit of WHOLESTAGE_HUGE_METHOD_LIMIT") {
     import testImplicits._
     withTempPath { dir =>
       val path = dir.getCanonicalPath
@@ -194,8 +194,9 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
       df.write.mode(SaveMode.Overwrite).parquet(path)
 
       withSQLConf(SQLConf.WHOLESTAGE_MAX_NUM_FIELDS.key -> "202",
-        SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "8000") {
-        // donot return batch, because whole stage codegen is disabled for wide table (>202 columns)
+        SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "2000") {
+        // wide table batch scan causes the bytecode of codegen to exceed the limit of
+        // WHOLESTAGE_HUGE_METHOD_LIMIT
         val df2 = spark.read.parquet(path)
         val fileScan2 = df2.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get
         assert(fileScan2.asInstanceOf[FileSourceScanExec].supportsBatch)