From 0fbd628b9b8817fd1b71faca92d87c56213d79f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B7=98=E6=B1=9F?= Date: Fri, 13 Jan 2017 16:41:37 +0800 Subject: [PATCH] [FLINK-4912] Introduce RECONCILIATING state in ExecutionGraph and Execution for JobManager failure recovery --- .../runtime/execution/ExecutionState.java | 24 ++++++++++++------- .../flink/runtime/jobgraph/JobStatus.java | 5 +++- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/execution/ExecutionState.java b/flink-runtime/src/main/java/org/apache/flink/runtime/execution/ExecutionState.java index e3e32560f3a8a..d6ff0cdd94af3 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/execution/ExecutionState.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/execution/ExecutionState.java @@ -25,16 +25,23 @@ *
{@code
  *
  *     CREATED  -> SCHEDULED -> DEPLOYING -> RUNNING -> FINISHED
- *                     |            |          |
- *                     |            |   +------+
- *                     |            V   V
- *                     |         CANCELLING -----+----> CANCELED
- *                     |                         |
- *                     +-------------------------+
+ *            |         |            |          |
+ *            |         |            |   +------+
+ *            |         |            V   V
+ *            |         |         CANCELLING -----+----> CANCELED
+ *            |         |                         |
+ *            |        +-------------------------+
+ *            |
+ *            |                                   ... -> FAILED
+ *           V
+ *    RECONCILING  -> RUNNING | FINISHED | CANCELED | FAILED
  *
- *                                               ... -> FAILED
  * }
* + *

It is possible to enter the {@code RECONCILING} state from {@code CREATED} + * state if job manager fail over, and the {@code RECONCILING} state can switch into + * any existing task state.

+ * *

It is possible to enter the {@code FAILED} state from any other state.

* *

The states {@code FINISHED}, {@code CANCELED}, and {@code FAILED} are @@ -56,8 +63,9 @@ public enum ExecutionState { CANCELED, - FAILED; + FAILED, + RECONCILING; public boolean isTerminal() { return this == FINISHED || this == CANCELED || this == FAILED; diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/jobgraph/JobStatus.java b/flink-runtime/src/main/java/org/apache/flink/runtime/jobgraph/JobStatus.java index 236a217768013..6a0ac9777f790 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/jobgraph/JobStatus.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/jobgraph/JobStatus.java @@ -51,7 +51,10 @@ public enum JobStatus { * The job has been suspended which means that it has been stopped but not been removed from a * potential HA job store. */ - SUSPENDED(TerminalState.LOCALLY); + SUSPENDED(TerminalState.LOCALLY), + + /** The job is currently reconciling and waits for task execution report to recover state. */ + RECONCILING(TerminalState.NON_TERMINAL); // --------------------------------------------------------------------------------------------