Commit 0b05d41

Merge branch 'master' into SPARK-31957

2 parents faf782e + 6fe3bf6
yaooqinn committed Jun 19, 2020

Showing 58 changed files with 1,029 additions and 384 deletions.
1 change: 1 addition & 0 deletions dev/.rat-excludes
@@ -123,3 +123,4 @@ SessionManager.java
SessionHandler.java
GangliaReporter.java
application_1578436911597_0052
config.properties
148 changes: 148 additions & 0 deletions dev/create-release/known_translations
@@ -261,3 +261,151 @@ yanlin-Lynn - Yanlin Wang
yucai - Yucai Yu
zhengruifeng - Ruifeng Zheng
zuotingbing - Tingbing Zuo
012huang - Weiyi Huang
07ARB - Ankit Raj Boudh
Andrew-Crosby - Andrew Crosby
AngersZhuuuu - Yi Zhu
Deegue - Yizhong Zhang
Gschiavon - German Schiavon Matteo
GuoPhilipse - Philipse Guo
Hellsen83 - Erik Christiansen
Icysandwich - Icysandwich
JasonWayne - Wenjie Wu
JkSelf - Ke Jia
JoanFM - Joan Fontanals
JulienPeloton - Julien Peloton
Koraseg - Artem Kupchinskiy
KyleLi1985 - Liang Li
LiShuMing - Shuming Li
LinhongLiu - Liu, Linhong
LuciferYang - Yang Jie
MaxGekk - Maxim Gekk
Ngone51 - Yi Wu
PavithraRamachandran - Pavithra Ramachandran
SongYadong - Yadong Song
TigerYang414 - David Yang
TomokoKomiyama - Tomoko Komiyama
TopGunViper - TopGunViper
Udbhav30 - Udbhav Agrawal
WangGuangxin - Guangxin Wang
William1104 - William Wong
YongjinZhou - Yongjin Zhou
aaruna - Aaruna Godthi
adrian555 - Weiqiang Zhuang
ajithme - Ajith S
amanomer - Aman Omer
ancasarb - Anca Sarb
avkgh - Aleksandr Kashkirov
ayudovin - Artsiom Yudovin
bartosz25 - Bartosz Konieczny
beliefer - Jiaan Geng
bettermouse - Chen Hao
bscan - Brian Scannell
cchung100m - Neo Chien
cclauss - Christian Clauss
chakravarthiT - Chakravarthi
chandulal - Chandu Kavar
chitralverma - Chitral Verma
cjn082030 - Jenny
cloud-fan - Wenchen Fan
codeborui - codeborui
colinmjj - Colin Ma
cxzl25 - cxzl25
cyq89051127 - Yongqiang Chai
darrentirto - Darren Tirto
daviddingly - Xiaoyuan Ding
davidvrba - David Vrba
deepyaman - Deepyaman Datta
denglingang - Lingang Deng
dengziming - dengziming
deshanxiao - deshanxiao
dima-asana - Dima Kamalov
dlindelof - David Lindelof
dongjoon-hyun - Dongjoon Hyun
eatoncys - eatoncys
fan31415 - Yijie Fan
fitermay - Yuli Fiterman
francis0407 - Mingcong Han
fuwhu - Fuwang Hu
gss2002 - Greg Senia
hddong - Dongdong Hong
hehuiyuan - hehuiyuan
helenyugithub - Helen Yu
highmoutain - highmoutain
httfighter - Tiantian Han
huangtianhua - huangtianhua
hvanhovell - Herman Van Hovell
iRakson - Rakesh Raushan
igorcalabria - Igor Calabria
imback82 - Terry Kim
javierivanov - Javier Fuentes
joelgenter - Joel Genter
ketank-new - Ketan Kunde
laskfla - Keith Sun
lcqzte10192193 - Chaoqun Li
leoluan2009 - Xuedong Luan
liangxs - Xuesen Liang
lidinghao - Li Hao
linehrr - Ryne Yang
linzebing - Zebing Lin
lipzhu - Lipeng Zhu
liucht-inspur - liucht-inspur
liupc - Pengcheng Liu
liwensun - Liwen Sun
manuzhang - Manu Zhang
mareksimunek - Marek Simunek
masa3141 - Masahiro Kazama
mdianjun - Dianjun Ma
merrily01 - Ruilei Ma
mob-ai - mob-ai
mu5358271 - Shuheng Dai
mwlon - Martin Loncaric
nandorKollar - Nandor Kollar
nooberfsh - nooberfsh
oleg-smith - Oleg Kuznetsov
ozancicek - Ozan Cicekci
pengbo - Peng Bo
planga82 - Pablo Langa Blanco
praneetsharma - Praneet Sharma
ptkool - Michael Styles
qb-tarushg - Tarush Grover
redsanket - Sanket Reddy
redsk - Nicola Bova
roland1982 - roland1982
rongma1997 - Rong Ma
rrusso2007 - Rob Russo
samsetegne - Samuel L. Setegne
sangramga - Sangram Gaikwad
sarthfrey - Sarth Frey
seayoun - Haiyang Yu
sev7e0 - Jiaqi Li
shahidki31 - Shahid
sharangk - Sharanabasappa G Keriwaddi
sheepstop - Ting Yang
shivsood - Shiv Prashant Sood
sitegui - Guilherme Souza
slamke - Sun Ke
southernriver - Liang Chen
squito - Imran Rashid
stczwd - Jackey Lee
sujith71955 - Sujith Chacko
suxingfate - Xinglong Wang
teeyog - teeyog
tinhto-000 - Tin Hang To
tools4origins - tools4origins
triplesheep - triplesheep
turboFei - Fei Wang
ulysses-you - ulysses-you
uzadude - Ohad Raviv
wackxu - wackxu
wangjiaochun - wangjiaochun
wangshisan - wangshisan
weixiuli - XiuLi Wei
wenfang6 - wenfang6
wenxuanguan - wenxuanguan
windpiger - Song Jun
woudygao - Woudy Gao
xianyinxin - Xianyin Xin
yunzoud - Yun Zou
zero323 - Maciej Szymkiewicz
zjf2012 - Jiafu Zhang
2 changes: 1 addition & 1 deletion docs/running-on-kubernetes.md
@@ -1341,7 +1341,7 @@ The following affect the driver and executor containers. All other containers in
<td>See description</td>
<td>
The container name will be assigned by spark ("spark-kubernetes-driver" for the driver container, and
"executor" for each executor container) if not defined by the pod template. If the container is defined by the
"spark-kubernetes-executor" for each executor container) if not defined by the pod template. If the container is defined by the
template, the template's name will be used.
</td>
</tr>
17 changes: 16 additions & 1 deletion docs/running-on-yarn.md
@@ -82,6 +82,18 @@ In `cluster` mode, the driver runs on a different machine than the client, so `S

Running Spark on YARN requires a binary distribution of Spark which is built with YARN support.
Binary distributions can be downloaded from the [downloads page](https://spark.apache.org/downloads.html) of the project website.
There are two variants of Spark binary distributions you can download. One is pre-built with a certain
version of Apache Hadoop; this Spark distribution contains a built-in Hadoop runtime, so we call it the
`with-hadoop` Spark distribution. The other is pre-built with user-provided Hadoop; since this Spark
distribution doesn't contain a built-in Hadoop runtime, it's smaller, but users have to provide a Hadoop
installation separately. We call this variant the `no-hadoop` Spark distribution. Because the `with-hadoop`
Spark distribution already contains a built-in Hadoop runtime, by default, when a job is submitted to a
Hadoop Yarn cluster, it will not populate Yarn's classpath into Spark, in order to prevent jar conflicts.
To override this behavior, you can set <code>spark.yarn.populateHadoopClasspath=true</code>. The `no-hadoop`
Spark distribution populates Yarn's classpath by default, since it needs the Hadoop runtime from the cluster.
With the `with-hadoop` Spark distribution, if your application depends on a library that is only available
in the cluster, you can try to populate the Yarn classpath by setting the property mentioned above. If doing
so introduces jar conflicts, you will need to turn it off and include the library in your application jar.
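As an illustration (a sketch, not part of this doc change; the app name is a placeholder), the flag can be set per application before the `SparkContext` is created, or passed with `--conf` at submit time:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Sketch: a `with-hadoop` build whose application needs a library that is
// only available on the cluster; opt in to populating YARN's classpath.
val conf = new SparkConf()
  .setAppName("yarn-classpath-demo") // hypothetical app name
  .set("spark.yarn.populateHadoopClasspath", "true")

val spark = SparkSession.builder().config(conf).getOrCreate()
```

Equivalently: `spark-submit --conf spark.yarn.populateHadoopClasspath=true ...`.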

To build Spark yourself, refer to [Building Spark](building-spark.html).

To make Spark runtime jars accessible from YARN side, you can specify `spark.yarn.archive` or `spark.yarn.jars`. For details please refer to [Spark Properties](running-on-yarn.html#spark-properties). If neither `spark.yarn.archive` nor `spark.yarn.jars` is specified, Spark will create a zip file with all jars under `$SPARK_HOME/jars` and upload it to the distributed cache.
@@ -396,7 +408,10 @@ To use a custom metrics.properties for the application master and executors, upd
</tr>
<tr>
<td><code>spark.yarn.populateHadoopClasspath</code></td>
<td>true</td>
<td>
For <code>with-hadoop</code> Spark distribution, this is set to false;
for <code>no-hadoop</code> distribution, this is set to true.
</td>
<td>
Whether to populate Hadoop classpath from <code>yarn.application.classpath</code> and
<code>mapreduce.application.classpath</code>. Note that if this is set to <code>false</code>,
8 changes: 4 additions & 4 deletions docs/web-ui.md
@@ -435,10 +435,10 @@ The tracked operations are listed as follows.
As an early-release version, the statistics page is still under development and will be improved in
future releases.

## Streaming Tab
The web UI includes a Streaming tab if the application uses Spark streaming. This tab displays
scheduling delay and processing time for each micro-batch in the data stream, which can be useful
for troubleshooting the streaming application.
## Streaming (DStreams) Tab
The web UI includes a Streaming tab if the application uses Spark Streaming with DStream API.
This tab displays scheduling delay and processing time for each micro-batch in the data stream,
which can be useful for troubleshooting the streaming application.
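For context, a minimal DStream application that would surface this tab (a sketch, not part of this change; the socket source and batch interval are placeholders, and a master must be configured at submit time):

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: any running DStream job makes the Streaming (DStreams) tab appear,
// reporting scheduling delay and processing time for each micro-batch.
val conf = new SparkConf().setAppName("dstream-ui-demo")
val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
ssc.socketTextStream("localhost", 9999)          // placeholder source
  .count()
  .print()
ssc.start()
ssc.awaitTermination()
```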

## JDBC/ODBC Server Tab
We can see this tab when Spark is running as a [distributed SQL engine](sql-distributed-sql-engine.html). It shows information about sessions and submitted SQL operations.
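As an aside (not part of this diff), the tab also appears when the Thrift server is started inside an application; a sketch, assuming a Spark build with the thrift-server module and an existing `SparkSession` named `spark`:

```scala
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2

// Sketch: expose the current session over JDBC/ODBC; the JDBC/ODBC Server
// tab then shows sessions and submitted SQL operations for this app.
HiveThriftServer2.startWithContext(spark.sqlContext)
```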
1 change: 0 additions & 1 deletion project/SparkBuild.scala
@@ -480,7 +480,6 @@ object SparkParallelTestGrouping {
"org.apache.spark.sql.hive.thriftserver.SparkSQLEnvSuite",
"org.apache.spark.sql.hive.thriftserver.ui.ThriftServerPageSuite",
"org.apache.spark.sql.hive.thriftserver.ui.HiveThriftServer2ListenerSuite",
"org.apache.spark.sql.hive.thriftserver.ThriftServerWithSparkContextSuite",
"org.apache.spark.sql.kafka010.KafkaDelegationTokenSuite"
)

16 changes: 16 additions & 0 deletions resource-managers/yarn/pom.xml
@@ -30,8 +30,18 @@
<properties>
<sbt.project.name>yarn</sbt.project.name>
<jersey-1.version>1.19</jersey-1.version>
<spark.yarn.isHadoopProvided>false</spark.yarn.isHadoopProvided>
</properties>

<profiles>
<profile>
<id>hadoop-provided</id>
<properties>
<spark.yarn.isHadoopProvided>true</spark.yarn.isHadoopProvided>
</properties>
</profile>
</profiles>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -201,6 +211,12 @@
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<resources>
<resource>
<directory>src/main/resources</directory>
<filtering>true</filtering>
</resource>
</resources>
</build>

</project>
resource-managers/yarn/src/main/resources/org/apache/spark/deploy/yarn/config.properties
@@ -0,0 +1 @@
spark.yarn.isHadoopProvided = ${spark.yarn.isHadoopProvided}
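This one-line template is filtered by Maven at build time (via the `<resources>` filtering enabled in the yarn module's pom above), so the packaged file reads `spark.yarn.isHadoopProvided = true` for a `-Phadoop-provided` build and `false` otherwise.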
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
@@ -17,12 +17,14 @@

package org.apache.spark.deploy.yarn

import java.util.Properties
import java.util.concurrent.TimeUnit

import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.ConfigBuilder
import org.apache.spark.network.util.ByteUnit

package object config {
package object config extends Logging {

/* Common app configuration. */

@@ -74,10 +76,11 @@ package object config {
.doc("Whether to populate Hadoop classpath from `yarn.application.classpath` and " +
"`mapreduce.application.classpath` Note that if this is set to `false`, it requires " +
"a `with-Hadoop` Spark distribution that bundles Hadoop runtime or user has to provide " +
"a Hadoop installation separately.")
"a Hadoop installation separately. By default, for `with-hadoop` Spark distribution, " +
"this is set to `false`; for `no-hadoop` distribution, this is set to `true`.")
.version("2.4.6")
.booleanConf
.createWithDefault(true)
.createWithDefault(isHadoopProvided())

private[spark] val GATEWAY_ROOT_PATH = ConfigBuilder("spark.yarn.config.gatewayPath")
.doc("Root of configuration paths that is present on gateway nodes, and will be replaced " +
@@ -394,4 +397,20 @@ package object config {
private[yarn] val YARN_DRIVER_RESOURCE_TYPES_PREFIX = "spark.yarn.driver.resource."
private[yarn] val YARN_AM_RESOURCE_TYPES_PREFIX = "spark.yarn.am.resource."

def isHadoopProvided(): Boolean = IS_HADOOP_PROVIDED

private lazy val IS_HADOOP_PROVIDED: Boolean = {
val configPath = "org/apache/spark/deploy/yarn/config.properties"
val propertyKey = "spark.yarn.isHadoopProvided"
try {
val prop = new Properties()
prop.load(ClassLoader.getSystemClassLoader.getResourceAsStream(configPath))
prop.getProperty(propertyKey).toBoolean
} catch {
case e: Exception =>
log.warn(s"Can not load the default value of `$propertyKey` from " +
s"`$configPath` with error, ${e.toString}. Using `false` as a default value.")
false
}
}
}
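Putting the pieces together: the `hadoop-provided` Maven profile stamps `spark.yarn.isHadoopProvided` into the bundled `config.properties`, and the loader above turns that into the runtime default for `spark.yarn.populateHadoopClasspath`. A sketch (e.g. in `spark-shell`, with the yarn module on the classpath) of inspecting the resolved value:

```scala
import org.apache.spark.deploy.yarn.config._

// Sketch: true for a -Phadoop-provided (`no-hadoop`) build, false for a
// `with-hadoop` build; this value seeds spark.yarn.populateHadoopClasspath.
val hadoopProvided: Boolean = isHadoopProvided()
println(s"hadoop-provided build: $hadoopProvided")
```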
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -303,7 +303,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case BinaryType => buildCast[Array[Byte]](_, UTF8String.fromBytes)
case DateType => buildCast[Int](_, d => UTF8String.fromString(dateFormatter.format(d)))
case TimestampType => buildCast[Long](_,
t => UTF8String.fromString(DateTimeUtils.timestampToString(timestampFormatter, t)))
t => UTF8String.fromString(timestampFormatter.format(t)))
case ArrayType(et, _) =>
buildCast[ArrayData](_, array => {
val builder = new UTF8StringBuilder
@@ -443,7 +443,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case ByteType =>
buildCast[Byte](_, b => longToTimestamp(b.toLong))
case DateType =>
buildCast[Int](_, d => epochDaysToMicros(d, zoneId))
buildCast[Int](_, d => daysToMicros(d, zoneId))
// TimestampWritable.decimalToTimestamp
case DecimalType() =>
buildCast[Decimal](_, d => decimalToTimestamp(d))
@@ -480,7 +480,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case TimestampType =>
// throw valid precision more than seconds, according to Hive.
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
buildCast[Long](_, t => microsToEpochDays(t, zoneId))
buildCast[Long](_, t => microsToDays(t, zoneId))
}

// IntervalConverter
@@ -1034,8 +1034,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
val tf = JavaCode.global(
ctx.addReferenceObj("timestampFormatter", timestampFormatter),
timestampFormatter.getClass)
(c, evPrim, evNull) => code"""$evPrim = UTF8String.fromString(
org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($tf, $c));"""
(c, evPrim, evNull) => code"""$evPrim = UTF8String.fromString($tf.format($c));"""
case CalendarIntervalType =>
(c, evPrim, _) => code"""$evPrim = UTF8String.fromString($c.toString());"""
case ArrayType(et, _) =>
@@ -1120,7 +1119,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
val zid = getZoneId()
(c, evPrim, evNull) =>
code"""$evPrim =
org.apache.spark.sql.catalyst.util.DateTimeUtils.microsToEpochDays($c, $zid);"""
org.apache.spark.sql.catalyst.util.DateTimeUtils.microsToDays($c, $zid);"""
case _ =>
(c, evPrim, evNull) => code"$evNull = true;"
}
@@ -1247,7 +1246,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
zoneIdClass)
(c, evPrim, evNull) =>
code"""$evPrim =
org.apache.spark.sql.catalyst.util.DateTimeUtils.epochDaysToMicros($c, $zid);"""
org.apache.spark.sql.catalyst.util.DateTimeUtils.daysToMicros($c, $zid);"""
case DecimalType() =>
(c, evPrim, evNull) => code"$evPrim = ${decimalToTimestampCode(c)};"
case DoubleType =>
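The Cast changes track `DateTimeUtils` renames (`epochDaysToMicros` → `daysToMicros`, `microsToEpochDays` → `microsToDays`) and call `timestampFormatter.format` directly instead of `DateTimeUtils.timestampToString`. A quick way to exercise these code paths (a sketch, assuming a running `SparkSession` named `spark`):

```scala
// date -> timestamp uses daysToMicros; timestamp -> date uses microsToDays;
// timestamp -> string formats via timestampFormatter.format.
spark.sql(
  """SELECT CAST(DATE'2020-06-19' AS TIMESTAMP)            AS d2ts,
    |       CAST(TIMESTAMP'2020-06-19 12:00:00' AS DATE)   AS ts2d,
    |       CAST(TIMESTAMP'2020-06-19 12:00:00' AS STRING) AS ts2s
    |""".stripMargin).show()
```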
