Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-24266][k8s] Restart the watcher when we receive a version changed from k8s #28423

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ import java.util.{Collections, UUID}
import java.util.Properties

import io.fabric8.kubernetes.api.model._
import io.fabric8.kubernetes.client.KubernetesClient
import io.fabric8.kubernetes.client.{KubernetesClient, Watch}
import io.fabric8.kubernetes.client.Watcher.Action
import scala.collection.mutable
import scala.util.control.NonFatal
import util.control.Breaks._

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkApplication
Expand Down Expand Up @@ -127,25 +129,37 @@ private[spark] class Client(
.endSpec()
.build()
val driverPodName = resolvedDriverPod.getMetadata.getName
Utils.tryWithResource(
kubernetesClient
.pods()
.withName(driverPodName)
.watch(watcher)) { _ =>
val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod)
try {
val otherKubernetesResources =
resolvedDriverSpec.driverKubernetesResources ++ Seq(configMap)
addDriverOwnerReference(createdDriverPod, otherKubernetesResources)
kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace()
} catch {
case NonFatal(e) =>
kubernetesClient.pods().delete(createdDriverPod)
throw e
}

val sId = Seq(conf.namespace, driverPodName).mkString(":")
watcher.watchOrStop(sId)
var watch: Watch = null
val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod)
try {
val otherKubernetesResources = resolvedDriverSpec.driverKubernetesResources ++ Seq(configMap)
addDriverOwnerReference(createdDriverPod, otherKubernetesResources)
kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace()
} catch {
case NonFatal(e) =>
kubernetesClient.pods().delete(createdDriverPod)
throw e
}
val sId = Seq(conf.namespace, driverPodName).mkString(":")
breakable {
while (true) {
val podWithName = kubernetesClient
.pods()
.withName(driverPodName)
// Reset resource to old before we start the watch, this is important for race conditions
watcher.reset()
watch = podWithName.watch(watcher)

// Send the latest pod state we know to the watcher to make sure we didn't miss anything
watcher.eventReceived(Action.MODIFIED, podWithName.get())

// Break the while loop if the pod is completed or we don't want to wait
if(watcher.watchOrStop(sId)) {
watch.close()
break
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ package org.apache.spark.deploy.k8s.submit
import io.fabric8.kubernetes.api.model.Pod
import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher}
import io.fabric8.kubernetes.client.Watcher.Action
import java.net.HttpURLConnection.HTTP_GONE

import org.apache.spark.deploy.k8s.Config._
import org.apache.spark.deploy.k8s.KubernetesDriverConf
import org.apache.spark.deploy.k8s.KubernetesUtils._
import org.apache.spark.internal.Logging

private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] {
def watchOrStop(submissionId: String): Unit
def watchOrStop(submissionId: String): Boolean
def reset(): Unit
}

/**
Expand All @@ -42,10 +44,16 @@ private[k8s] class LoggingPodStatusWatcherImpl(conf: KubernetesDriverConf)

private var podCompleted = false

private var resourceTooOldReceived = false

private var pod = Option.empty[Pod]

private def phase: String = pod.map(_.getStatus.getPhase).getOrElse("unknown")

override def reset(): Unit = {
resourceTooOldReceived = false
}

override def eventReceived(action: Action, pod: Pod): Unit = {
this.pod = Option(pod)
action match {
Expand All @@ -62,7 +70,12 @@ private[k8s] class LoggingPodStatusWatcherImpl(conf: KubernetesDriverConf)

override def onClose(e: KubernetesClientException): Unit = {
logDebug(s"Stopping watching application $appId with last-observed phase $phase")
closeWatch()
if(e != null && e.getCode == HTTP_GONE) {
resourceTooOldReceived = true
logDebug(s"Got HTTP Gone code, resource version changed in k8s api: $e")
} else {
closeWatch()
}
}

private def logLongStatus(): Unit = {
Expand All @@ -78,20 +91,26 @@ private[k8s] class LoggingPodStatusWatcherImpl(conf: KubernetesDriverConf)
this.notifyAll()
}

override def watchOrStop(sId: String): Unit = if (conf.get(WAIT_FOR_APP_COMPLETION)) {
override def watchOrStop(sId: String): Boolean = if (conf.get(WAIT_FOR_APP_COMPLETION)) {
logInfo(s"Waiting for application ${conf.appName} with submission ID $sId to finish...")
val interval = conf.get(REPORT_INTERVAL)
synchronized {
while (!podCompleted) {
while (!podCompleted && !resourceTooOldReceived) {
wait(interval)
logInfo(s"Application status for $appId (phase: $phase)")
}
}
logInfo(
pod.map { p => s"Container final statuses:\n\n${containersDescription(p)}" }
.getOrElse("No containers were found in the driver pod."))
logInfo(s"Application ${conf.appName} with submission ID $sId finished")

if(podCompleted) {
logInfo(
pod.map { p => s"Container final statuses:\n\n${containersDescription(p)}" }
.getOrElse("No containers were found in the driver pod."))
logInfo(s"Application ${conf.appName} with submission ID $sId finished")
}
podCompleted
} else {
logInfo(s"Deployed Spark application ${conf.appName} with submission ID $sId into Kubernetes")
// Always act like the application has completed since we don't want to wait for app completion
true
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter {
createdResourcesArgumentCaptor = ArgumentCaptor.forClass(classOf[HasMetadata])
when(podOperations.create(FULL_EXPECTED_POD)).thenReturn(POD_WITH_OWNER_REFERENCE)
when(namedPods.watch(loggingPodStatusWatcher)).thenReturn(mock[Watch])
when(loggingPodStatusWatcher.watchOrStop(kconf.namespace + ":" + POD_NAME)).thenReturn(true)
doReturn(resourceList)
.when(kubernetesClient)
.resourceList(createdResourcesArgumentCaptor.capture())
Expand Down