Merge remote-tracking branch 'upstream/master' into SPARK-24781
viirya committed Jul 12, 2018
2 parents 38a935d + 5ad4735 commit eff3af2
Showing 109 changed files with 2,910 additions and 528 deletions.
5 changes: 5 additions & 0 deletions R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -590,6 +590,7 @@ summary(model)
Predict values on training data
```{r}
prediction <- predict(model, training)
head(select(prediction, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
```

#### Logistic Regression
@@ -613,6 +614,7 @@ summary(model)
Predict values on training data
```{r}
fitted <- predict(model, training)
head(select(fitted, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
```

Multinomial logistic regression against three classes
@@ -807,6 +809,7 @@ df <- createDataFrame(t)
dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
summary(dtModel)
predictions <- predict(dtModel, df)
head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
```

#### Gradient-Boosted Trees
@@ -822,6 +825,7 @@ df <- createDataFrame(t)
gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2)
summary(gbtModel)
predictions <- predict(gbtModel, df)
head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
```

#### Random Forest
@@ -837,6 +841,7 @@ df <- createDataFrame(t)
rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2)
summary(rfModel)
predictions <- predict(rfModel, df)
head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
```

#### Bisecting k-Means
5 changes: 3 additions & 2 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -385,7 +385,7 @@ private[spark] class SparkSubmit extends Logging {
val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)

def shouldDownload(scheme: String): Boolean = {
forceDownloadSchemes.contains(scheme) ||
forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
}

@@ -578,7 +578,8 @@ private[spark] class SparkSubmit extends Logging {
}
// Add the main application jar and any added jars to classpath in case YARN client
// requires these jars.
// This assumes both primaryResource and user jars are local jars, otherwise it will not be
// This assumes both primaryResource and user jars are local jars, or already downloaded
// to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be
// added to the classpath of YARN client.
if (isYarnCluster) {
if (isUserJar(args.primaryResource)) {
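
A self-contained sketch of the broadened rule above (the scheme list is an illustrative stand-in for `spark.yarn.dist.forceDownloadSchemes`, not part of the patch): a resource is force-downloaded when the wildcard `*` is configured, when its scheme is listed explicitly, or when Hadoop has no `FileSystem` for the scheme.

```scala
import scala.util.Try

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

object ForceDownloadSketch {
  // Illustrative stand-in for the value of spark.yarn.dist.forceDownloadSchemes.
  val forceDownloadSchemes: Seq[String] = Seq("http", "https")
  val hadoopConf = new Configuration()

  // Same shape as the updated shouldDownload above.
  def shouldDownload(scheme: String): Boolean = {
    forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
      Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
  }

  def main(args: Array[String]): Unit = {
    println(shouldDownload("http"))    // true: the scheme is listed explicitly
    println(shouldDownload("file"))    // false: not listed, and hadoop-common supports "file"
    println(shouldDownload("nosuch"))  // true: no Hadoop FileSystem is registered for this scheme
  }
}
```
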
@@ -233,30 +233,44 @@ private[spark] class RestSubmissionClient(master: String) extends Logging {
private[rest] def readResponse(connection: HttpURLConnection): SubmitRestProtocolResponse = {
import scala.concurrent.ExecutionContext.Implicits.global
val responseFuture = Future {
val dataStream =
if (connection.getResponseCode == HttpServletResponse.SC_OK) {
connection.getInputStream
} else {
connection.getErrorStream
val responseCode = connection.getResponseCode

if (responseCode != HttpServletResponse.SC_OK) {
val errString = Some(Source.fromInputStream(connection.getErrorStream())
.getLines().mkString("\n"))
if (responseCode == HttpServletResponse.SC_INTERNAL_SERVER_ERROR &&
!connection.getContentType().contains("application/json")) {
throw new SubmitRestProtocolException(s"Server responded with exception:\n${errString}")
}
logError(s"Server responded with error:\n${errString}")
val error = new ErrorResponse
if (responseCode == RestSubmissionServer.SC_UNKNOWN_PROTOCOL_VERSION) {
error.highestProtocolVersion = RestSubmissionServer.PROTOCOL_VERSION
}
error.message = errString.get
error
} else {
val dataStream = connection.getInputStream

// If the server threw an exception while writing a response, it will not have a body
if (dataStream == null) {
throw new SubmitRestProtocolException("Server returned empty body")
}
val responseJson = Source.fromInputStream(dataStream).mkString
logDebug(s"Response from the server:\n$responseJson")
val response = SubmitRestProtocolMessage.fromJson(responseJson)
response.validate()
response match {
// If the response is an error, log the message
case error: ErrorResponse =>
logError(s"Server responded with error:\n${error.message}")
error
// Otherwise, simply return the response
case response: SubmitRestProtocolResponse => response
case unexpected =>
throw new SubmitRestProtocolException(
s"Message received from server was not a response:\n${unexpected.toJson}")
}
// If the server threw an exception while writing a response, it will not have a body
if (dataStream == null) {
throw new SubmitRestProtocolException("Server returned empty body")
}
val responseJson = Source.fromInputStream(dataStream).mkString
logDebug(s"Response from the server:\n$responseJson")
val response = SubmitRestProtocolMessage.fromJson(responseJson)
response.validate()
response match {
// If the response is an error, log the message
case error: ErrorResponse =>
logError(s"Server responded with error:\n${error.message}")
error
// Otherwise, simply return the response
case response: SubmitRestProtocolResponse => response
case unexpected =>
throw new SubmitRestProtocolException(
s"Message received from server was not a response:\n${unexpected.toJson}")
}
}

@@ -486,10 +486,11 @@ package object config {

private[spark] val FORCE_DOWNLOAD_SCHEMES =
ConfigBuilder("spark.yarn.dist.forceDownloadSchemes")
.doc("Comma-separated list of schemes for which files will be downloaded to the " +
.doc("Comma-separated list of schemes for which resources will be downloaded to the " +
"local disk prior to being added to YARN's distributed cache. For use in cases " +
"where the YARN service does not support schemes that are supported by Spark, like http, " +
"https and ftp.")
"https and ftp, or jars required to be in the local YARN client's classpath. Wildcard " +
"'*' is denoted to download resources for all the schemes.")
.stringConf
.toSequence
.createWithDefault(Nil)
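
A hedged usage sketch of the option documented above (scheme values are examples): these keys are normally supplied with `--conf` at `spark-submit` time or in `spark-defaults.conf`; a `SparkConf` is used here only to show the property name and the two ways of listing schemes, including the new wildcard.

```scala
import org.apache.spark.SparkConf

// Force-download only http/https resources before they are added to YARN's distributed cache...
val conf = new SparkConf()
  .set("spark.yarn.dist.forceDownloadSchemes", "http,https")

// ...or, with the new wildcard, download resources for every scheme.
val confAll = new SparkConf()
  .set("spark.yarn.dist.forceDownloadSchemes", "*")
```
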
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala
@@ -30,7 +30,7 @@ private[spark]
class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId])
extends RDD[T](sc, Nil) {

@transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
@transient lazy val _locations = BlockManager.blockIdsToLocations(blockIds, SparkEnv.get)
@volatile private var _isValid = true

override def getPartitions: Array[Partition] = {
@@ -45,6 +45,7 @@ import org.apache.spark.network.netty.SparkTransportConf
import org.apache.spark.network.shuffle.{ExternalShuffleClient, TempFileManager}
import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.scheduler.ExecutorCacheTaskLocation
import org.apache.spark.serializer.{SerializerInstance, SerializerManager}
import org.apache.spark.shuffle.ShuffleManager
import org.apache.spark.storage.memory._
@@ -1554,7 +1555,7 @@ private[spark] class BlockManager(
private[spark] object BlockManager {
private val ID_GENERATOR = new IdGenerator

def blockIdsToHosts(
def blockIdsToLocations(
blockIds: Array[BlockId],
env: SparkEnv,
blockManagerMaster: BlockManagerMaster = null): Map[BlockId, Seq[String]] = {
@@ -1569,7 +1570,9 @@ private[spark] object BlockManager {

val blockManagers = new HashMap[BlockId, Seq[String]]
for (i <- 0 until blockIds.length) {
blockManagers(blockIds(i)) = blockLocations(i).map(_.host)
blockManagers(blockIds(i)) = blockLocations(i).map { loc =>
ExecutorCacheTaskLocation(loc.host, loc.executorId).toString
}
}
blockManagers.toMap
}
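
A small sketch of the location format the renamed `blockIdsToLocations` now returns; `ExecutorCacheTaskLocation` is `private[spark]`, so this assumes it is compiled inside Spark's own packages (for example in a test), and the host and executor id are made up.

```scala
package org.apache.spark.storage

import org.apache.spark.scheduler.ExecutorCacheTaskLocation

object LocationFormatSketch {
  def main(args: Array[String]): Unit = {
    // A block cached by executor "1" on "host1" is now reported as a full task location,
    // not just a hostname.
    val loc = ExecutorCacheTaskLocation(host = "host1", executorId = "1")
    // Prints "executor_host1_1", the format asserted in the new BlockManagerSuite test below.
    println(loc.toString)
  }
}
```
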
@@ -368,8 +368,8 @@ private[spark] class ExternalSorter[K, V, C](
val bufferedIters = iterators.filter(_.hasNext).map(_.buffered)
type Iter = BufferedIterator[Product2[K, C]]
val heap = new mutable.PriorityQueue[Iter]()(new Ordering[Iter] {
// Use the reverse of comparator.compare because PriorityQueue dequeues the max
override def compare(x: Iter, y: Iter): Int = -comparator.compare(x.head._1, y.head._1)
// Use the reverse order because PriorityQueue dequeues the max
override def compare(x: Iter, y: Iter): Int = comparator.compare(y.head._1, x.head._1)
})
heap.enqueue(bufferedIters: _*) // Will contain only the iterators with hasNext = true
new Iterator[Product2[K, C]] {
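
A standalone illustration of the merge-heap trick the updated comment describes (plain `Int`s stand in for the buffered iterators): Scala's `PriorityQueue` dequeues the largest element for its `Ordering`, so a k-way merge that needs the smallest head supplies a reversed comparison.

```scala
import scala.collection.mutable

object ReverseOrderingSketch {
  def main(args: Array[String]): Unit = {
    val ascending: Ordering[Int] = Ordering.Int

    // Default behaviour: the maximum element is dequeued first.
    val maxHeap = new mutable.PriorityQueue[Int]()(ascending)
    maxHeap.enqueue(3, 1, 2)
    println(maxHeap.dequeue())  // 3

    // Reversing the comparison (swapping the arguments) turns it into a min-heap,
    // which is what merging sorted spill files needs.
    val minHeap = new mutable.PriorityQueue[Int]()(new Ordering[Int] {
      override def compare(x: Int, y: Int): Int = ascending.compare(y, x)
    })
    minHeap.enqueue(3, 1, 2)
    println(minHeap.dequeue())  // 1
  }
}
```

Swapping the arguments rather than negating the result (as the replaced line did) also avoids the comparator corner case where negating `Int.MinValue` overflows.
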
29 changes: 19 additions & 10 deletions core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala
@@ -995,20 +995,24 @@ class SparkSubmitSuite
}

test("download remote resource if it is not supported by yarn service") {
testRemoteResources(enableHttpFs = false, blacklistHttpFs = false)
testRemoteResources(enableHttpFs = false)
}

test("avoid downloading remote resource if it is supported by yarn service") {
testRemoteResources(enableHttpFs = true, blacklistHttpFs = false)
testRemoteResources(enableHttpFs = true)
}

test("force download from blacklisted schemes") {
testRemoteResources(enableHttpFs = true, blacklistHttpFs = true)
testRemoteResources(enableHttpFs = true, blacklistSchemes = Seq("http"))
}

test("force download for all the schemes") {
testRemoteResources(enableHttpFs = true, blacklistSchemes = Seq("*"))
}

private def testRemoteResources(
enableHttpFs: Boolean,
blacklistHttpFs: Boolean): Unit = {
blacklistSchemes: Seq[String] = Nil): Unit = {
val hadoopConf = new Configuration()
updateConfWithFakeS3Fs(hadoopConf)
if (enableHttpFs) {
@@ -1025,8 +1029,8 @@ class SparkSubmitSuite
val tmpHttpJar = TestUtils.createJarWithFiles(Map("test.resource" -> "USER"), tmpDir)
val tmpHttpJarPath = s"http://${new File(tmpHttpJar.toURI).getAbsolutePath}"

val forceDownloadArgs = if (blacklistHttpFs) {
Seq("--conf", "spark.yarn.dist.forceDownloadSchemes=http")
val forceDownloadArgs = if (blacklistSchemes.nonEmpty) {
Seq("--conf", s"spark.yarn.dist.forceDownloadSchemes=${blacklistSchemes.mkString(",")}")
} else {
Nil
}
@@ -1044,14 +1048,19 @@

val jars = conf.get("spark.yarn.dist.jars").split(",").toSet

// The URI of remote S3 resource should still be remote.
assert(jars.contains(tmpS3JarPath))
def isSchemeBlacklisted(scheme: String) = {
blacklistSchemes.contains("*") || blacklistSchemes.contains(scheme)
}

if (!isSchemeBlacklisted("s3")) {
assert(jars.contains(tmpS3JarPath))
}

if (enableHttpFs && !blacklistHttpFs) {
if (enableHttpFs && blacklistSchemes.isEmpty) {
// If Http FS is supported by yarn service, the URI of remote http resource should
// still be remote.
assert(jars.contains(tmpHttpJarPath))
} else {
} else if (!enableHttpFs || isSchemeBlacklisted("http")) {
// If Http FS is not supported by yarn service, or http scheme is configured to be force
// downloading, the URI of remote http resource should be changed to a local one.
val jarName = new File(tmpHttpJar.toURI).getName
@@ -1422,6 +1422,19 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE
assert(mockBlockTransferService.tempFileManager === store.remoteBlockTempFileManager)
}

test("query locations of blockIds") {
val mockBlockManagerMaster = mock(classOf[BlockManagerMaster])
val blockLocations = Seq(BlockManagerId("1", "host1", 100), BlockManagerId("2", "host2", 200))
when(mockBlockManagerMaster.getLocations(mc.any[Array[BlockId]]))
.thenReturn(Array(blockLocations))
val env = mock(classOf[SparkEnv])

val blockIds: Array[BlockId] = Array(StreamBlockId(1, 2))
val locs = BlockManager.blockIdsToLocations(blockIds, env, mockBlockManagerMaster)
val expectedLocs = Seq("executor_host1_1", "executor_host2_2")
assert(locs(blockIds(0)) == expectedLocs)
}

class MockBlockTransferService(val maxFailures: Int) extends BlockTransferService {
var numCalls = 0
var tempFileManager: TempFileManager = null
1 change: 1 addition & 0 deletions dev/requirements.txt
@@ -2,3 +2,4 @@ jira==1.0.3
PyGithub==1.26.0
Unidecode==0.04.19
pypandoc==1.3.3
sphinx
28 changes: 28 additions & 0 deletions docs/ml-statistics.md
@@ -89,4 +89,32 @@ Refer to the [`ChiSquareTest` Python docs](api/python/index.html#pyspark.ml.stat
{% include_example python/ml/chi_square_test_example.py %}
</div>

</div>

## Summarizer

We provide vector column summary statistics for `Dataframe` through `Summarizer`.
Available metrics are the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count.

<div class="codetabs">
<div data-lang="scala" markdown="1">
The following example demonstrates using [`Summarizer`](api/scala/index.html#org.apache.spark.ml.stat.Summarizer$)
to compute the mean and variance for a vector column of the input dataframe, with and without a weight column.

{% include_example scala/org/apache/spark/examples/ml/SummarizerExample.scala %}
</div>

<div data-lang="java" markdown="1">
The following example demonstrates using [`Summarizer`](api/java/org/apache/spark/ml/stat/Summarizer.html)
to compute the mean and variance for a vector column of the input dataframe, with and without a weight column.

{% include_example java/org/apache/spark/examples/ml/JavaSummarizerExample.java %}
</div>

<div data-lang="python" markdown="1">
Refer to the [`Summarizer` Python docs](api/python/index.html#pyspark.ml.stat.Summarizer$) for details on the API.

{% include_example python/ml/summarizer_example.py %}
</div>

</div>
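
For readers without the linked example sources at hand, a minimal Scala sketch of the pattern those examples follow (assumes a running Spark session; the data and column names are illustrative):

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("SummarizerSketch").getOrCreate()
import spark.implicits._

val df = Seq(
  (Vectors.dense(2.0, 3.0, 5.0), 1.0),
  (Vectors.dense(4.0, 6.0, 7.0), 2.0)
).toDF("features", "weight")

// Mean and variance of the vector column, weighted by the "weight" column.
val (weightedMean, weightedVariance) = df
  .select(Summarizer.metrics("mean", "variance").summary($"features", $"weight").as("summary"))
  .select("summary.mean", "summary.variance")
  .as[(Vector, Vector)].first()

// The same metrics without a weight column, via the shortcut functions.
val (meanVal, varianceVal) = df
  .select(Summarizer.mean($"features"), Summarizer.variance($"features"))
  .as[(Vector, Vector)].first()
```
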
48 changes: 48 additions & 0 deletions docs/running-on-kubernetes.md
@@ -629,6 +629,54 @@ specific to Spark on Kubernetes.
Add as an environment variable to the executor container with name EnvName (case sensitive), the value referenced by key <code> key </code> in the data of the referenced <a href="https://kubernetes.io/docs/concepts/configuration/secret/#using-secrets-as-environment-variables">Kubernetes Secret</a>. For example,
<code>spark.kubernetes.executor.secrets.ENV_VAR=spark-secret:key</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.driver.volumes.[VolumeType].[VolumeName].mount.path</code></td>
<td>(none)</td>
<td>
Add the <a href="https://kubernetes.io/docs/concepts/storage/volumes/">Kubernetes Volume</a> named <code>VolumeName</code> of the <code>VolumeType</code> type to the driver pod on the path specified in the value. For example,
<code>spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.mount.path=/checkpoint</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.driver.volumes.[VolumeType].[VolumeName].mount.readOnly</code></td>
<td>(none)</td>
<td>
Specify if the mounted volume is read only or not. For example,
<code>spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.mount.readOnly=false</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.driver.volumes.[VolumeType].[VolumeName].options.[OptionName]</code></td>
<td>(none)</td>
<td>
Configure <a href="https://kubernetes.io/docs/concepts/storage/volumes/">Kubernetes Volume</a> options passed to Kubernetes, using <code>OptionName</code> as the key and the configured value; options must conform to the Kubernetes option format. For example,
<code>spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.options.claimName=spark-pvc-claim</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.executor.volumes.[VolumeType].[VolumeName].mount.path</code></td>
<td>(none)</td>
<td>
Add the <a href="https://kubernetes.io/docs/concepts/storage/volumes/">Kubernetes Volume</a> named <code>VolumeName</code> of the <code>VolumeType</code> type to the executor pod on the path specified in the value. For example,
<code>spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.mount.path=/checkpoint</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.executor.volumes.[VolumeType].[VolumeName].mount.readOnly</code></td>
<td>false</td>
<td>
Specify if the mounted volume is read only or not. For example,
<code>spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.mount.readOnly=false</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.executor.volumes.[VolumeType].[VolumeName].options.[OptionName]</code></td>
<td>(none)</td>
<td>
Configure <a href="https://kubernetes.io/docs/concepts/storage/volumes/">Kubernetes Volume</a> options passed to Kubernetes, using <code>OptionName</code> as the key and the configured value. For example,
<code>spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.options.claimName=spark-pvc-claim</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.memoryOverheadFactor</code></td>
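
Putting several of the rows above together, a hedged sketch (the volume name, claim name, and mount path mirror the table's examples; in practice these keys are passed with `--conf` at `spark-submit` time or via `spark-defaults.conf`, and a `SparkConf` is used here only to keep keys and values in one place):

```scala
import org.apache.spark.SparkConf

// Mount the persistentVolumeClaim "checkpointpvc" read-write at /checkpoint on the driver,
// and the same claim read-only at the same path on the executors.
val k8sVolumes = new SparkConf()
  .set("spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.mount.path", "/checkpoint")
  .set("spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.mount.readOnly", "false")
  .set("spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.options.claimName", "spark-pvc-claim")
  .set("spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.mount.path", "/checkpoint")
  .set("spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.mount.readOnly", "true")
  .set("spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.options.claimName", "spark-pvc-claim")
```
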
5 changes: 3 additions & 2 deletions docs/running-on-yarn.md
@@ -218,9 +218,10 @@ To use a custom metrics.properties for the application master and executors, upd
<td><code>spark.yarn.dist.forceDownloadSchemes</code></td>
<td><code>(none)</code></td>
<td>
Comma-separated list of schemes for which files will be downloaded to the local disk prior to
Comma-separated list of schemes for which resources will be downloaded to the local disk prior to
being added to YARN's distributed cache. For use in cases where the YARN service does not
support schemes that are supported by Spark, like http, https and ftp.
support schemes that are supported by Spark, like http, https and ftp, or where jars are required to be in the
local YARN client's classpath. The wildcard '*' downloads resources for all schemes.
</td>
</tr>
<tr>
