[SPARK-5945] Spark should not retry a stage infinitely on a FetchFailedException #5636
Changes from all commits
core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -46,7 +46,7 @@ import org.apache.spark.util.CallSite | |
* be updated for each attempt. | ||
* | ||
*/ | ||
private[spark] abstract class Stage( | ||
private[scheduler] abstract class Stage( | ||
val id: Int, | ||
val rdd: RDD[_], | ||
val numTasks: Int, | ||
|
@@ -92,6 +92,29 @@ private[spark] abstract class Stage( | |
*/ | ||
private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId) | ||
|
||
/** | ||
* Set of stage attempt IDs that have failed with a FetchFailure. We keep track of these | ||
* failures in order to avoid endless retries if a stage keeps failing with a FetchFailure. | ||
* We keep track of each attempt ID that has failed to avoid recording duplicate failures if | ||
* multiple tasks from the same stage attempt fail (SPARK-5945). | ||
*/ | ||
Review comment: This comment is super verbose, and after reading it, it's still not obvious what this is actually storing. What about something like: "Set of IDs of stage attempts that have failed with a FetchFailure. We keep track of these failures in order to avoid endless retries if a stage keeps failing with a FetchFailure. We keep track of each attempt ID that has failed to avoid recording duplicate failures if multiple tasks from the same stage attempt fail."
||
private val fetchFailedAttemptIds = new HashSet[Int] | ||
|
||
private[scheduler] def clearFailures() : Unit = { | ||
fetchFailedAttemptIds.clear() | ||
} | ||
|
||
/** | ||
* Check whether we should abort the failedStage due to multiple consecutive fetch failures. | ||
* | ||
* This method updates the running set of failed stage attempts and returns | ||
* true if the number of failures exceeds the allowable number of failures. | ||
*/ | ||
private[scheduler] def failedOnFetchAndShouldAbort(stageAttemptId: Int): Boolean = { | ||
fetchFailedAttemptIds.add(stageAttemptId) | ||
fetchFailedAttemptIds.size >= Stage.MAX_CONSECUTIVE_FETCH_FAILURES | ||
} | ||
|
||
/** Creates a new attempt for this stage by creating a new StageInfo with a new attempt ID. */ | ||
def makeNewStageAttempt( | ||
numPartitionsToCompute: Int, | ||
|
@@ -110,3 +133,8 @@ private[spark] abstract class Stage( | |
case _ => false | ||
} | ||
} | ||
|
||
private[scheduler] object Stage { | ||
// The number of consecutive failures allowed before a stage is aborted | ||
val MAX_CONSECUTIVE_FETCH_FAILURES = 4 | ||
} |
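
The DAGScheduler changes that drive these helpers are not part of this excerpt. As a standalone illustration of the bookkeeping above (the demo object below mirrors the patch's logic but is not part of it): multiple task failures from the same stage attempt add the same attempt ID to the set, so they count only once; only a new failed attempt grows the set, and the stage should be aborted once the number of distinct failed attempts reaches the limit.

import scala.collection.mutable.HashSet

// Illustrative sketch only -- mirrors Stage.failedOnFetchAndShouldAbort, not the patch itself.
object FetchFailureTrackingDemo {
  val MAX_CONSECUTIVE_FETCH_FAILURES = 4
  private val fetchFailedAttemptIds = new HashSet[Int]

  def failedOnFetchAndShouldAbort(stageAttemptId: Int): Boolean = {
    fetchFailedAttemptIds.add(stageAttemptId)
    fetchFailedAttemptIds.size >= MAX_CONSECUTIVE_FETCH_FAILURES
  }

  def main(args: Array[String]): Unit = {
    // Several tasks from attempt 0 fail: the attempt is only counted once.
    assert(!failedOnFetchAndShouldAbort(0))
    assert(!failedOnFetchAndShouldAbort(0))
    assert(!failedOnFetchAndShouldAbort(0))
    // Attempts 1 and 2 fail: still below the limit of 4 distinct failed attempts.
    assert(!failedOnFetchAndShouldAbort(1))
    assert(!failedOnFetchAndShouldAbort(2))
    // The fourth distinct failed attempt crosses the threshold and should abort the stage.
    assert(failedOnFetchAndShouldAbort(3))
  }
}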
core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -26,11 +26,11 @@ import org.scalatest.concurrent.Timeouts | |
import org.scalatest.time.SpanSugar._ | ||
|
||
import org.apache.spark._ | ||
import org.apache.spark.executor.TaskMetrics | ||
import org.apache.spark.rdd.RDD | ||
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode | ||
import org.apache.spark.storage.{BlockId, BlockManagerId, BlockManagerMaster} | ||
import org.apache.spark.util.CallSite | ||
import org.apache.spark.executor.TaskMetrics | ||
|
||
class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler) | ||
extends DAGSchedulerEventProcessLoop(dagScheduler) { | ||
|
@@ -473,6 +473,282 @@ class DAGSchedulerSuite | |
assertDataStructuresEmpty() | ||
} | ||
|
||
|
||
// Helper function to validate state when creating tests for task failures | ||
private def checkStageId(stageId: Int, attempt: Int, stageAttempt: TaskSet) { | ||
assert(stageAttempt.stageId === stageId) | ||
assert(stageAttempt.stageAttemptId == attempt) | ||
} | ||
|
||
|
||
// Helper functions to extract commonly used code in Fetch Failure test cases | ||
private def setupStageAbortTest(sc: SparkContext) { | ||
sc.listenerBus.addListener(new EndListener()) | ||
ended = false | ||
jobResult = null | ||
} | ||
|
||
// Create a new Listener to confirm that the listenerBus sees the JobEnd message | ||
// when we abort the stage. This message will also be consumed by the EventLoggingListener | ||
// so this will propagate up to the user. | ||
var ended = false | ||
var jobResult : JobResult = null | ||
|
||
class EndListener extends SparkListener { | ||
override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { | ||
jobResult = jobEnd.jobResult | ||
ended = true | ||
} | ||
} | ||
|
||
/** | ||
* Common code to get the next stage attempt, confirm it's the one we expect, and complete it | ||
* successfully. | ||
* | ||
* @param stageId - The current stageId | ||
* @param attemptIdx - The current attempt count | ||
* @param numShufflePartitions - The number of partitions in the next stage | ||
*/ | ||
private def completeShuffleMapStageSuccessfully( | ||
stageId: Int, | ||
attemptIdx: Int, | ||
numShufflePartitions: Int): Unit = { | ||
Review comment: nit: multiline format, each arg on its own line:
def completeNextShuffleMapSuccessfully(
    stageId: Int,
    attemptIdx: Int,
    numShufflePartitions: Int): Unit = {
Review comment: typo: Successfully
||
val stageAttempt = taskSets.last | ||
checkStageId(stageId, attemptIdx, stageAttempt) | ||
complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map { | ||
case (task, idx) => | ||
(Success, makeMapStatus("host" + ('A' + idx).toChar, numShufflePartitions)) | ||
}.toSeq) | ||
} | ||
|
||
/** | ||
* Common code to get the next stage attempt, confirm it's the one we expect, and complete it | ||
* with all tasks resulting in a FetchFailure. | ||
Review comment: Is there a reason all of the tasks need to end in Failure? Can you just have one task end in a failure?
Reply: This is primarily for convenience when automatically generating failing tasks.
Reply: to expand on this slightly -- the one case where we do want to have more than one fetch failure is "Multiple tasks w/ fetch failures in same stage attempt should not abort the stage". I know the other cases could just have one fetch failure, but it seems they are still testing the right thing with all fetch failures, and this way we can reuse this method. Of course there are no end of different variants we could add tests for, but it seems to me this is a reasonably good balance. I suppose we could also add an …
Reply: Yeah I was just thinking that with this method, the "Multiple tasks w/ fetch failures..." test basically adds nothing, because it's just a subset of the functionality in the other tests (and so a failure in that test won't be that useful, because all of the tests will fail). A consequence is that there's no way to distinguish (in the test results) between a bug when there's a single task failure, and a bug that only manifests when multiple tasks in a stage fail. But this isn't the end of the world and if it's too hard to fix, seems OK to leave as-is (we should just re-order the tests so the unit test that's a subset of the others comes first).
Reply: yeah, agree that as is, that test isn't really adding anything over the other tests as you've noted. I certainly don't think I'd say "too hard to fix" -- I suppose it's just my antsy-ness to get this in, but objectively, it probably makes sense to fix. All you are really asking is to change completeNextStageWithFetchFailure to oneFetchFailureInNextStage and change "Multiple tasks w/ fetch failures..." to just directly do what this method is doing now, pretty minor change. How about this: wait a day for @ilganeli to update, and if he doesn't get to it we merge as-is and I do a simple follow-up pr?
Reply (from @ilganeli, by email, quoting Imran Rashid's comment above): Imran – I don't have cycles to do a significant refactor at the moment. I would suggest we merge and follow up later.
(A sketch of such a single-failure variant follows the method below.)
||
* | ||
* @param stageId - The current stageId | ||
* @param attemptIdx - The current attempt count | ||
* @param shuffleDep - The shuffle dependency of the stage with a fetch failure | ||
*/ | ||
private def completeNextStageWithFetchFailure( | ||
stageId: Int, | ||
attemptIdx: Int, | ||
shuffleDep: ShuffleDependency[_, _, _]): Unit = { | ||
Review comment: nit: multiline format
||
val stageAttempt = taskSets.last | ||
checkStageId(stageId, attemptIdx, stageAttempt) | ||
complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map { case (task, idx) => | ||
(FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0, idx, "ignored"), null) | ||
}.toSeq) | ||
} | ||
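
As a reference for the discussion above about failing only a single task, a sketch of such a variant might look like the following. The helper name, the extra numShufflePartitions parameter, and the choice to complete the remaining tasks successfully are assumptions, not part of this patch; it would reuse the same suite helpers (taskSets, checkStageId, complete, makeBlockManagerId, makeMapStatus) used above.

// Hypothetical helper (not in this patch): fail only the first task of the latest stage
// attempt with a FetchFailure and complete the remaining tasks successfully.
private def completeNextStageWithOneFetchFailure(
    stageId: Int,
    attemptIdx: Int,
    shuffleDep: ShuffleDependency[_, _, _],
    numShufflePartitions: Int): Unit = {
  val stageAttempt = taskSets.last
  checkStageId(stageId, attemptIdx, stageAttempt)
  complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map {
    // The first task fails with a FetchFailure against the given shuffle dependency.
    case (_, 0) =>
      (FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0, 0, "ignored"), null)
    // The remaining tasks succeed with a map status, as in completeShuffleMapStageSuccessfully.
    case (_, idx) =>
      (Success, makeMapStatus("host" + ('A' + idx).toChar, numShufflePartitions))
  }.toSeq)
}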
|
||
/** | ||
* Common code to get the next result stage attempt, confirm it's the one we expect, and | ||
* complete it with a success where we return 42. | ||
* | ||
* @param stageId - The current stageId | ||
* @param attemptIdx - The current attempt count | ||
*/ | ||
private def completeNextResultStageWithSuccess(stageId: Int, attemptIdx: Int): Unit = { | ||
val stageAttempt = taskSets.last | ||
checkStageId(stageId, attemptIdx, stageAttempt) | ||
assert(scheduler.stageIdToStage(stageId).isInstanceOf[ResultStage]) | ||
complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map(_ => (Success, 42)).toSeq) | ||
} | ||
|
||
/** | ||
* In this test, we simulate a job where many tasks in the same stage fail. We want to show | ||
* that many fetch failures inside a single stage attempt do not trigger an abort | ||
* on their own, but only when there are enough failing stage attempts. | ||
*/ | ||
test("Single fetch failure should not abort the stage.") { | ||
setupStageAbortTest(sc) | ||
|
||
val parts = 8 | ||
val shuffleMapRdd = new MyRDD(sc, parts, Nil) | ||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) | ||
val shuffleId = shuffleDep.shuffleId | ||
val reduceRdd = new MyRDD(sc, parts, List(shuffleDep)) | ||
submit(reduceRdd, (0 until parts).toArray) | ||
|
||
completeShuffleMapStageSuccessfully(0, 0, numShufflePartitions = parts) | ||
|
||
completeNextStageWithFetchFailure(1, 0, shuffleDep) | ||
Review comment: wait I think things got a little confused between all the comments from Kay, Andrew, and me ... Maybe the name should actually be "multiple tasks with fetch failures in a single stage attempt should not abort the stage"?
Reply: What? Why does it matter if there are one vs multiple tasks that failed with the fetch failure? Your suggestion is very verbose...
Reply: Was your concern that "Single fetch failure" could refer to a task? If so we can call this "Single stage fetch failure"
||
|
||
// Resubmit and confirm that now all is well | ||
scheduler.resubmitFailedStages() | ||
|
||
assert(scheduler.runningStages.nonEmpty) | ||
assert(!ended) | ||
|
||
// Complete stage 0 and then stage 1 with a "42" | ||
completeShuffleMapStageSuccessfully(0, 1, numShufflePartitions = parts) | ||
completeNextResultStageWithSuccess(1, 1) | ||
|
||
// Confirm job finished successfully | ||
sc.listenerBus.waitUntilEmpty(1000) | ||
assert(ended === true) | ||
assert(results === (0 until parts).map { idx => idx -> 42 }.toMap) | ||
assertDataStructuresEmpty() | ||
} | ||
|
||
/** | ||
* In this test we simulate a job failure where the first stage completes successfully and | ||
* the second stage fails due to a fetch failure. Multiple successive fetch failures of a stage | ||
* trigger an overall job abort to avoid endless retries. | ||
*/ | ||
test("Multiple consecutive stage fetch failures should lead to job being aborted.") { | ||
setupStageAbortTest(sc) | ||
|
||
val shuffleMapRdd = new MyRDD(sc, 2, Nil) | ||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) | ||
val shuffleId = shuffleDep.shuffleId | ||
val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) | ||
submit(reduceRdd, Array(0, 1)) | ||
|
||
for (attempt <- 0 until Stage.MAX_CONSECUTIVE_FETCH_FAILURES) { | ||
// Complete all the tasks for the current attempt of stage 0 successfully | ||
completeShuffleMapStageSuccessfully(0, attempt, numShufflePartitions = 2) | ||
|
||
// Now we should have a new taskSet, for a new attempt of stage 1. | ||
// Fail all these tasks with FetchFailure | ||
completeNextStageWithFetchFailure(1, attempt, shuffleDep) | ||
Review comment: same here, all tasks, not one task
||
|
||
// this will trigger a resubmission of stage 0, since we've lost some of its | ||
// map output, for the next iteration through the loop | ||
scheduler.resubmitFailedStages() | ||
|
||
if (attempt < Stage.MAX_CONSECUTIVE_FETCH_FAILURES - 1) { | ||
assert(scheduler.runningStages.nonEmpty) | ||
assert(!ended) | ||
} else { | ||
// Stage should have been aborted and removed from running stages | ||
assertDataStructuresEmpty() | ||
sc.listenerBus.waitUntilEmpty(1000) | ||
assert(ended) | ||
jobResult match { | ||
case JobFailed(reason) => | ||
assert(reason.getMessage.contains("ResultStage 1 () has failed the maximum")) | ||
case other => fail(s"expected JobFailed, not $other") | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* In this test, we create a job with two consecutive shuffles, and simulate 2 failures for each | ||
* shuffle fetch. In total, the job has had four failures overall but not four failures | ||
* for a particular stage, and as such should not be aborted. | ||
*/ | ||
test("Failures in different stages should not trigger an overall abort") { | ||
setupStageAbortTest(sc) | ||
|
||
val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() | ||
val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) | ||
val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() | ||
val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) | ||
val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) | ||
submit(finalRdd, Array(0)) | ||
|
||
// In the first two iterations, Stage 0 succeeds and stage 1 fails. In the next two iterations, | ||
// stage 2 fails. | ||
for (attempt <- 0 until Stage.MAX_CONSECUTIVE_FETCH_FAILURES) { | ||
// Complete all the tasks for the current attempt of stage 0 successfully | ||
completeShuffleMapStageSuccessfully(0, attempt, numShufflePartitions = 2) | ||
|
||
if (attempt < Stage.MAX_CONSECUTIVE_FETCH_FAILURES / 2) { | ||
// Now we should have a new taskSet, for a new attempt of stage 1. | ||
// Fail all these tasks with FetchFailure | ||
completeNextStageWithFetchFailure(1, attempt, shuffleDepOne) | ||
Review comment: and here
||
} else { | ||
completeShuffleMapStageSuccessfully(1, attempt, numShufflePartitions = 1) | ||
|
||
// Fail stage 2 | ||
completeNextStageWithFetchFailure(2, attempt - Stage.MAX_CONSECUTIVE_FETCH_FAILURES / 2, | ||
shuffleDepTwo) | ||
} | ||
|
||
// this will trigger a resubmission of stage 0, since we've lost some of its | ||
// map output, for the next iteration through the loop | ||
scheduler.resubmitFailedStages() | ||
} | ||
Review comment: Can you also successfully complete stage 0 & stage 1 at this point?
Reply: Do you mean to rerun stage 0 and stage 1 again? What would that show?
Reply: I just want to make sure that we can actually finish the job successfully -- you could imagine your checks all passing, but the job is stuck in some weird state where the DAGScheduler thinks the stage is still running, but no task sets actually created, or something weird like that. Your asserts are almost there, but not quite. The final call to … I know this is minor, I'd just like to be extra thorough.
||
|
||
completeShuffleMapStageSuccessfully(0, 4, numShufflePartitions = 2) | ||
completeShuffleMapStageSuccessfully(1, 4, numShufflePartitions = 1) | ||
|
||
// Succeed stage 2 with a "42" | ||
completeNextResultStageWithSuccess(2, Stage.MAX_CONSECUTIVE_FETCH_FAILURES / 2) | ||
|
||
assert(results === Map(0 -> 42)) | ||
assertDataStructuresEmpty() | ||
} | ||
|
||
|
||
/** | ||
* In this test we demonstrate that only consecutive failures trigger a stage abort. A stage may | ||
* fail multiple times, succeed, then fail a few more times (because it's run again by downstream | ||
* dependencies). The total number of failed attempts for one stage will go over the limit, | ||
* but that doesn't matter, since they have successes in the middle. | ||
*/ | ||
test("Non-consecutive stage failures don't trigger abort") { | ||
setupStageAbortTest(sc) | ||
|
||
val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() | ||
val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) | ||
val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() | ||
val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) | ||
val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) | ||
submit(finalRdd, Array(0)) | ||
|
||
// First, execute stages 0 and 1, failing stage 1 up to MAX-1 times. | ||
for (attempt <- 0 until Stage.MAX_CONSECUTIVE_FETCH_FAILURES - 1) { | ||
// Make each task in stage 0 succeed | ||
completeShuffleMapStageSuccessfully(0, attempt, numShufflePartitions = 2) | ||
|
||
// Now we should have a new taskSet, for a new attempt of stage 1. | ||
// Fail these tasks with FetchFailure | ||
completeNextStageWithFetchFailure(1, attempt, shuffleDepOne) | ||
Review comment: we've changed the behavior a bit here in the refactor, so the comment is no longer accurate -- now we fail all tasks w/ fetch failures.
||
|
||
scheduler.resubmitFailedStages() | ||
|
||
// Confirm we have not yet aborted | ||
assert(scheduler.runningStages.nonEmpty) | ||
assert(!ended) | ||
} | ||
|
||
// Rerun stage 0 and 1 to step through the task set | ||
completeShuffleMapStageSuccessfully(0, 3, numShufflePartitions = 2) | ||
completeShuffleMapStageSuccessfully(1, 3, numShufflePartitions = 1) | ||
|
||
// Fail stage 2 so that stage 1 is resubmitted when we call scheduler.resubmitFailedStages() | ||
completeNextStageWithFetchFailure(2, 0, shuffleDepTwo) | ||
|
||
scheduler.resubmitFailedStages() | ||
|
||
// Rerun stage 0 to step through the task set | ||
completeShuffleMapStageSuccessfully(0, 4, numShufflePartitions = 2) | ||
|
||
// Now again, fail stage 1 (up to MAX_FAILURES) but confirm that this doesn't trigger an abort | ||
// since we succeeded in between. | ||
completeNextStageWithFetchFailure(1, 4, shuffleDepOne) | ||
|
||
scheduler.resubmitFailedStages() | ||
|
||
// Confirm we have not yet aborted | ||
assert(scheduler.runningStages.nonEmpty) | ||
assert(!ended) | ||
Review comment: I'm confused by this test -- it has fetch failures, which will result in a stage retry, but the test description says the point is to test non-fetch failures.
Reply: Thanks for the comments @squito. I'll pick this up once the other patch is merged.
||
|
||
// Next, succeed all and confirm output | ||
// Rerun stage 0 + 1 | ||
completeShuffleMapStageSuccessfully(0, 5, numShufflePartitions = 2) | ||
completeShuffleMapStageSuccessfully(1, 5, numShufflePartitions = 1) | ||
|
||
// Succeed stage 2 and verify results | ||
completeNextResultStageWithSuccess(2, 1) | ||
|
||
assertDataStructuresEmpty() | ||
sc.listenerBus.waitUntilEmpty(1000) | ||
assert(ended === true) | ||
assert(results === Map(0 -> 42)) | ||
} | ||
|
||
test("trivial shuffle with multiple fetch failures") { | ||
val shuffleMapRdd = new MyRDD(sc, 2, Nil) | ||
val shuffleDep = new ShuffleDependency(shuffleMapRdd, null) | ||
|
@@ -810,15 +1086,15 @@ class DAGSchedulerSuite | |
submit(finalRdd, Array(0)) | ||
cacheLocations(shuffleTwoRdd.id -> 0) = Seq(makeBlockManagerId("hostD")) | ||
cacheLocations(shuffleTwoRdd.id -> 1) = Seq(makeBlockManagerId("hostC")) | ||
// complete stage 2 | ||
// complete stage 0 | ||
complete(taskSets(0), Seq( | ||
(Success, makeMapStatus("hostA", 2)), | ||
(Success, makeMapStatus("hostB", 2)))) | ||
// complete stage 1 | ||
complete(taskSets(1), Seq( | ||
(Success, makeMapStatus("hostA", 1)), | ||
(Success, makeMapStatus("hostB", 1)))) | ||
// pretend stage 0 failed because hostA went down | ||
// pretend stage 2 failed because hostA went down | ||
||
complete(taskSets(2), Seq( | ||
(FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0, "ignored"), null))) | ||
// TODO assert this: | ||
|
Review comment: This looks correct, but just so I understand, we need to do this in case this stage is resubmitted in the future for a different reason?
Reply: My understanding is that this is because the stage may need to be re-run later if one of the machines where its output is stored fails, so the output needs to be re-constructed.
Reply: Yes, that falls under "different reason", i.e. a fetch failure in a different stage that depends on this one. That makes sense.