New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-24309][CORE] AsyncEventQueue should stop on interrupt. #21356
Changes from 3 commits
a689f52
0a44c06
4a1f657
09d55af
fc8a197
008d14d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,11 @@ import org.apache.spark.util.Utils | |
* Delivery will only begin when the `start()` method is called. The `stop()` method should be | ||
* called when no more events need to be delivered. | ||
*/ | ||
private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveListenerBusMetrics) | ||
private class AsyncEventQueue( | ||
val name: String, | ||
conf: SparkConf, | ||
metrics: LiveListenerBusMetrics, | ||
bus: LiveListenerBus) | ||
extends SparkListenerBus | ||
with Logging { | ||
|
||
|
@@ -81,23 +85,18 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi | |
} | ||
|
||
private def dispatch(): Unit = LiveListenerBus.withinListenerThread.withValue(true) { | ||
try { | ||
var next: SparkListenerEvent = eventQueue.take() | ||
while (next != POISON_PILL) { | ||
val ctx = processingTime.time() | ||
try { | ||
super.postToAll(next) | ||
} finally { | ||
ctx.stop() | ||
} | ||
eventCount.decrementAndGet() | ||
next = eventQueue.take() | ||
var next: SparkListenerEvent = eventQueue.take() | ||
while (next != POISON_PILL) { | ||
val ctx = processingTime.time() | ||
try { | ||
super.postToAll(next) | ||
} finally { | ||
ctx.stop() | ||
} | ||
eventCount.decrementAndGet() | ||
} catch { | ||
case ie: InterruptedException => | ||
logInfo(s"Stopping listener queue $name.", ie) | ||
next = eventQueue.take() | ||
} | ||
eventCount.decrementAndGet() | ||
} | ||
|
||
override protected def getTimer(listener: SparkListenerInterface): Option[Timer] = { | ||
|
@@ -130,7 +129,11 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi | |
eventCount.incrementAndGet() | ||
eventQueue.put(POISON_PILL) | ||
} | ||
dispatchThread.join() | ||
// this thread might be trying to stop itself as part of error handling -- we can't join | ||
// in that case. | ||
if (Thread.currentThread() != dispatchThread) { | ||
dispatchThread.join() | ||
} | ||
} | ||
|
||
def post(event: SparkListenerEvent): Unit = { | ||
|
@@ -187,6 +190,12 @@ private class AsyncEventQueue(val name: String, conf: SparkConf, metrics: LiveLi | |
true | ||
} | ||
|
||
override def removeListenerOnError(listener: SparkListenerInterface): Unit = { | ||
// the listener failed in an unrecoverable way, we want to remove it from the entire | ||
// LiveListenerBus (potentially stopping a queue if it's empty) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's |
||
bus.removeListener(listener) | ||
} | ||
|
||
} | ||
|
||
private object AsyncEventQueue { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -102,7 +102,7 @@ private[spark] class LiveListenerBus(conf: SparkConf) { | |
queue.addListener(listener) | ||
|
||
case None => | ||
val newQueue = new AsyncEventQueue(queue, conf, metrics) | ||
val newQueue = new AsyncEventQueue(queue, conf, metrics, this) | ||
newQueue.addListener(listener) | ||
if (started.get()) { | ||
newQueue.start(sparkContext) | ||
|
@@ -111,6 +111,12 @@ private[spark] class LiveListenerBus(conf: SparkConf) { | |
} | ||
} | ||
|
||
private[scheduler] def removeQueue(queue: String): Unit = synchronized { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not used anymore. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we remove it? |
||
queues.asScala.find(_.name == queue).foreach { q => | ||
queues.remove(q) | ||
} | ||
} | ||
|
||
def removeListener(listener: SparkListenerInterface): Unit = synchronized { | ||
// Remove listener from all queues it was added to, and stop queues that have become empty. | ||
queues.asScala | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,6 +60,15 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { | |
} | ||
} | ||
|
||
/** | ||
* This can be overridden by subclasses if there is any extra cleanup to do when removing a | ||
* listener. In particular AsyncEventQueues can clean up queues in the LiveListenerBus. | ||
*/ | ||
def removeListenerOnError(listener: L): Unit = { | ||
removeListener(listener) | ||
} | ||
|
||
|
||
/** | ||
* Post the event to all registered listeners. The `postToAll` caller should guarantee calling | ||
* `postToAll` in the same thread for all events. | ||
|
@@ -80,6 +89,11 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { | |
} | ||
try { | ||
doPostEvent(listener, event) | ||
if (Thread.interrupted()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is ok right now since Spark code never explicitly interrupts these threads. If we ever need to do that, though, this might become a problem... but in that case I don't know how you'd handle this issue without just giving up and stopping everything. But... is this correct? Your test just calls
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah agree, I will handle this too -- but I'll wait to update till there is agreement on the right overall approach There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If spark were to explicitly interrupt, then I think we'd also set some other flag indicating a reason, eg. I've pushed an update to handle |
||
logError(s"Interrupted while posting to ${Utils.getFormattedClassName(listener)}. " + | ||
s"Removing that listener.") | ||
removeListenerOnError(listener) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if the thread state is interrupted, is it still safe to keep this queue running? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could just throw There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
} | ||
} catch { | ||
case NonFatal(e) if !isIgnorableException(e) => | ||
logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's ok to leave this, but this doesn't happen anymore, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does still happen, we need this. We see the interrupt in postToAll, which is in the queue thread. If it fails, we call
removeListenerOnError
. If that results in the queue being empty, we stop the queue.