2 changes: 1 addition & 1 deletion flink-end-to-end-tests/test-scripts/common.sh
@@ -346,7 +346,7 @@ function check_logs_for_exceptions {
| grep -v "java.io.InvalidClassException: org.apache.flink.formats.avro.typeutils.AvroSerializer" \
| grep -v "Caused by: java.lang.Exception: JobManager is shutting down" \
| grep -v "java.lang.Exception: Artificial failure" \
| grep -v "org.apache.flink.runtime.checkpoint.decline" \
| grep -v "org.apache.flink.runtime.checkpoint.CheckpointException" \
| grep -v "org.elasticsearch.ElasticsearchException" \
| grep -v "Elasticsearch exception" \
| grep -ic "exception" || true)

@@ -101,7 +101,8 @@ public void testJobManagerJMXMetricAccess() throws Exception {
5,
CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
true,
false),
false,
0),
null));

ClusterClient<?> client = MINI_CLUSTER_RESOURCE.getClusterClient();

@@ -21,7 +21,6 @@
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineException;
import org.apache.flink.runtime.checkpoint.hooks.MasterHooks;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.execution.ExecutionState;
@@ -33,6 +32,7 @@
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration;
import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint;
import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint;
import org.apache.flink.runtime.state.CheckpointStorageCoordinatorView;
@@ -185,15 +185,13 @@ public class CheckpointCoordinator {

private boolean isPreferCheckpointForRecovery;

private final CheckpointFailureManager failureManager;

// --------------------------------------------------------------------------------------------

public CheckpointCoordinator(
JobID job,
long baseInterval,
long checkpointTimeout,
long minPauseBetweenCheckpoints,
int maxConcurrentCheckpointAttempts,
CheckpointRetentionPolicy retentionPolicy,
CheckpointCoordinatorConfiguration chkConfig,
ExecutionVertex[] tasksToTrigger,
ExecutionVertex[] tasksToWaitFor,
ExecutionVertex[] tasksToCommitTo,
@@ -202,31 +200,29 @@ public CheckpointCoordinator(
StateBackend checkpointStateBackend,
Executor executor,
SharedStateRegistryFactory sharedStateRegistryFactory,
boolean isPreferCheckpointForRecovery) {
CheckpointFailureManager failureManager) {

// sanity checks
checkNotNull(checkpointStateBackend);
checkArgument(baseInterval > 0, "Checkpoint base interval must be larger than zero");
checkArgument(checkpointTimeout >= 1, "Checkpoint timeout must be larger than zero");
checkArgument(minPauseBetweenCheckpoints >= 0, "minPauseBetweenCheckpoints must be >= 0");
checkArgument(maxConcurrentCheckpointAttempts >= 1, "maxConcurrentCheckpointAttempts must be >= 1");

// max "in between duration" can be one year - this is to prevent numeric overflows
long minPauseBetweenCheckpoints = chkConfig.getMinPauseBetweenCheckpoints();
if (minPauseBetweenCheckpoints > 365L * 24 * 60 * 60 * 1_000) {
minPauseBetweenCheckpoints = 365L * 24 * 60 * 60 * 1_000;
}

// it does not make sense to schedule checkpoints more often then the desired
// time between checkpoints
long baseInterval = chkConfig.getCheckpointInterval();
if (baseInterval < minPauseBetweenCheckpoints) {
baseInterval = minPauseBetweenCheckpoints;
}

this.job = checkNotNull(job);
this.baseInterval = baseInterval;
this.checkpointTimeout = checkpointTimeout;
this.checkpointTimeout = chkConfig.getCheckpointTimeout();
this.minPauseBetweenCheckpointsNanos = minPauseBetweenCheckpoints * 1_000_000;
this.maxConcurrentCheckpointAttempts = maxConcurrentCheckpointAttempts;
this.maxConcurrentCheckpointAttempts = chkConfig.getMaxConcurrentCheckpoints();
this.tasksToTrigger = checkNotNull(tasksToTrigger);
this.tasksToWaitFor = checkNotNull(tasksToWaitFor);
this.tasksToCommitTo = checkNotNull(tasksToCommitTo);
@@ -236,7 +232,8 @@ public CheckpointCoordinator(
this.executor = checkNotNull(executor);
this.sharedStateRegistryFactory = checkNotNull(sharedStateRegistryFactory);
this.sharedStateRegistry = sharedStateRegistryFactory.create(executor);
this.isPreferCheckpointForRecovery = isPreferCheckpointForRecovery;
this.isPreferCheckpointForRecovery = chkConfig.isPreferCheckpointForRecovery();
this.failureManager = checkNotNull(failureManager);

this.recentPendingCheckpoints = new ArrayDeque<>(NUM_GHOST_CHECKPOINT_IDS);
this.masterHooks = new HashMap<>();
@@ -249,7 +246,7 @@ public CheckpointCoordinator(
this.timer.setContinueExistingPeriodicTasksAfterShutdownPolicy(false);
this.timer.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);

this.checkpointProperties = CheckpointProperties.forCheckpoint(retentionPolicy);
this.checkpointProperties = CheckpointProperties.forCheckpoint(chkConfig.getCheckpointRetentionPolicy());

try {
this.checkpointStorage = checkpointStateBackend.createCheckpointStorage(job);
@@ -342,7 +339,7 @@ public void shutdown(JobStatus jobStatus) throws Exception {

// clear and discard all pending checkpoints
for (PendingCheckpoint pending : pendingCheckpoints.values()) {
pending.abort(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
failPendingCheckpoint(pending, CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
}
pendingCheckpoints.clear();

@@ -439,6 +436,10 @@ public boolean triggerCheckpoint(long timestamp, boolean isPeriodic) {
triggerCheckpoint(timestamp, checkpointProperties, null, isPeriodic, false);
return true;
} catch (CheckpointException e) {
long latestGeneratedCheckpointId = getCheckpointIdCounter().get();

Contributor:

Can you explain why we actually need to get the latest id again from the counter, rather than just remember it in a field when we do checkpointID = checkpointIdCounter.getAndIncrement(); in triggerCheckpoint(...) and checkpointIdCounter.setCount(nextCheckpointId); in restore(...)?

Contributor Author:

I think reading the latest generated checkpoint id from the counter is the simplest and most direct way. It sidesteps two potential problems:

  • Concurrency: although a dedicated field would carry no real risk at the moment, relying on the CheckpointIDCounter means there is nothing to worry about at all.
  • Introducing a class-level field brings a risk of misuse when other logic is added later.
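
A minimal sketch of the two variants weighed in this thread (the cached-field variant is hypothetical and not part of this PR):

	// Hypothetical alternative raised above: remember the id in a field when it is generated.
	// private long latestGeneratedCheckpointId;
	// latestGeneratedCheckpointId = checkpointIdCounter.getAndIncrement();  // in triggerCheckpoint(...)
	// latestGeneratedCheckpointId = nextCheckpointId;                       // next to checkpointIdCounter.setCount(nextCheckpointId) in restore(...)

	// Variant taken in this PR: read the latest id back from the shared CheckpointIDCounter on
	// failure, so no extra field has to be kept in sync.
	long latestGeneratedCheckpointId = getCheckpointIdCounter().get();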

// here we can not get the failed pending checkpoint's id,
// so we pass the negative latest generated checkpoint id as a special flag
failureManager.handleCheckpointException(e, -1 * latestGeneratedCheckpointId);
return false;
}
}
@@ -459,7 +460,7 @@ public PendingCheckpoint triggerCheckpoint(
synchronized (lock) {
// abort if the coordinator has been shutdown in the meantime
if (shutdown) {
throw new CheckpointException(CheckpointFailureReason.COORDINATOR_SHUTDOWN);
throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
}

// Don't allow periodic checkpoint if scheduling has been disabled
@@ -599,7 +600,7 @@ public PendingCheckpoint triggerCheckpoint(
if (!checkpoint.isDiscarded()) {
LOG.info("Checkpoint {} of job {} expired before completing.", checkpointID, job);

checkpoint.abort(CheckpointFailureReason.CHECKPOINT_EXPIRED);
failPendingCheckpoint(checkpoint, CheckpointFailureReason.CHECKPOINT_EXPIRED);
pendingCheckpoints.remove(checkpointID);
rememberRecentCheckpointId(checkpointID);

@@ -614,7 +615,7 @@ public PendingCheckpoint triggerCheckpoint(
// since we released the lock in the meantime, we need to re-check
// that the conditions still hold.
if (shutdown) {
throw new CheckpointException(CheckpointFailureReason.COORDINATOR_SHUTDOWN);
throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
}
else if (!props.forceCheckpoint()) {
if (triggerRequestQueued) {
@@ -699,7 +700,7 @@ else if (!props.forceCheckpoint()) {
checkpointID, job, numUnsuccessful, t);

if (!checkpoint.isDiscarded()) {
checkpoint.abort(CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, t);
failPendingCheckpoint(checkpoint, CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, t);
}

try {
@@ -891,11 +892,12 @@ private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) thro
try {
try {
completedCheckpoint = pendingCheckpoint.finalizeCheckpoint();
failureManager.handleCheckpointSuccess(pendingCheckpoint.getCheckpointId());
}
catch (Exception e1) {
// abort the current pending checkpoint if we fails to finalize the pending checkpoint.
if (!pendingCheckpoint.isDiscarded()) {
pendingCheckpoint.abort(CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1);
failPendingCheckpoint(pendingCheckpoint, CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1);
}

throw new CheckpointException("Could not finalize the pending checkpoint " + checkpointId + '.',
@@ -1002,7 +1004,7 @@ private void dropSubsumedCheckpoints(long checkpointId) {
// remove all pending checkpoints that are lesser than the current completed checkpoint
if (p.getCheckpointId() < checkpointId && p.canBeSubsumed()) {
rememberRecentCheckpointId(p.getCheckpointId());
p.abort(CheckpointFailureReason.CHECKPOINT_SUBSUMED);
failPendingCheckpoint(p, CheckpointFailureReason.CHECKPOINT_SUBSUMED);
entries.remove();
}
}
@@ -1275,7 +1277,7 @@ public void stopCheckpointScheduler() {
public void abortPendingCheckpoints(CheckpointException exception) {
synchronized (lock) {
for (PendingCheckpoint p : pendingCheckpoints.values()) {
p.abort(exception.getCheckpointFailureReason());
failPendingCheckpoint(p, exception.getCheckpointFailureReason());
}

pendingCheckpoints.clear();
@@ -1329,10 +1331,13 @@ private void discardCheckpoint(PendingCheckpoint pendingCheckpoint, @Nullable Th

LOG.info("Discarding checkpoint {} of job {}.", checkpointId, job, cause);

if (cause == null || cause instanceof CheckpointDeclineException) {
pendingCheckpoint.abort(CheckpointFailureReason.CHECKPOINT_DECLINED, cause);
if (cause == null) {

Contributor:

Is there any harm in just rewriting this as

		if (cause == null) {
			failPendingCheckpoint(pendingCheckpoint, CheckpointFailureReason.CHECKPOINT_DECLINED);
		} else if (cause instanceof CheckpointException) {
			CheckpointException exception = (CheckpointException) cause;
			failPendingCheckpoint(pendingCheckpoint, exception.getCheckpointFailureReason(), cause);
		} else {
			failPendingCheckpoint(pendingCheckpoint, CheckpointFailureReason.JOB_FAILURE, cause);
		}

?

Contributor Author:

If all CheckpointException objects come from the Flink code, this style is OK.

failPendingCheckpoint(pendingCheckpoint, CheckpointFailureReason.CHECKPOINT_DECLINED);
} else if (cause instanceof CheckpointException) {
CheckpointException exception = (CheckpointException) cause;
failPendingCheckpoint(pendingCheckpoint, exception.getCheckpointFailureReason(), cause);
} else {
pendingCheckpoint.abort(CheckpointFailureReason.JOB_FAILURE, cause);
failPendingCheckpoint(pendingCheckpoint, CheckpointFailureReason.JOB_FAILURE, cause);

Member:

I noticed that you did not touch the AbstractStreamOperator part. With your PR, if an operator fails to complete its snapshot in #snapshotState(long, long, CheckpointOptions, CheckpointStreamFactory), it only declines that checkpoint and surfaces the failure as an exception. In other words, CheckpointFailureManager only processes the failed checkpoint in this line, which means continuousFailureCounter in CheckpointFailureManager is not increased. Is this reasonable?

Contributor Author:

Actually, CheckpointFailureManager can do more in the future, but not yet. This is an intermediate step of the overall three PRs. In this PR the most important thing is to stay compatible with setFailOnCheckpointingErrors; otherwise it would change behavior for many users. We did consider counting more failure reasons, but that would require more changes and make this PR more complex. So your thought is right, just not for now.

The purpose of this PR is to introduce the CheckpointFailureManager and finish the residual refactoring work from the first PR #7571.

}

rememberRecentCheckpointId(checkpointId);
@@ -1384,4 +1389,21 @@ public void run() {
});
}
}

private void failPendingCheckpoint(
final PendingCheckpoint pendingCheckpoint,
final CheckpointFailureReason reason,
final Throwable cause) {

CheckpointException exception = new CheckpointException(reason, cause);
pendingCheckpoint.abort(reason, cause);
failureManager.handleCheckpointException(exception, pendingCheckpoint.getCheckpointId());
}

private void failPendingCheckpoint(
final PendingCheckpoint pendingCheckpoint,
final CheckpointFailureReason reason) {

failPendingCheckpoint(pendingCheckpoint, reason, null);
}
}
@@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.runtime.checkpoint;

import org.apache.flink.util.FlinkRuntimeException;

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
* The checkpoint failure manager which centrally manages checkpoint failure processing logic.
*/
public class CheckpointFailureManager {

private final static int UNLIMITED_TOLERABLE_FAILURE_NUMBER = Integer.MAX_VALUE;

private final int tolerableCpFailureNumber;
private final FailJobCallback failureCallback;
private final AtomicInteger continuousFailureCounter;
private final Set<Long> countedCheckpointIds;

public CheckpointFailureManager(int tolerableCpFailureNumber, FailJobCallback failureCallback) {
checkArgument(tolerableCpFailureNumber >= 0,
"The tolerable checkpoint failure number is illegal, " +
"it must be greater than or equal to 0 .");
this.tolerableCpFailureNumber = tolerableCpFailureNumber;
this.continuousFailureCounter = new AtomicInteger(0);
this.failureCallback = checkNotNull(failureCallback);
this.countedCheckpointIds = ConcurrentHashMap.newKeySet();
}

/**
* Handle checkpoint exception with a handler callback.
*
* @param exception the checkpoint exception.
* @param checkpointId the failed checkpoint id used to count the continuous failure number based on

Contributor:

What about mentioning the special meaning of negative id values?

Contributor Author:

OK, will give more description.

Contributor Author:

I have added the description of the checkpoint id parameter, and created an issue (FLINK-12514) to track the future refactoring of the counting mechanism based on ordered checkpoint ids.

Contributor:

Ok, great. After all those changes I might do a final pass over all changes later but so far I think the outcome should be that we can merge soon.

* checkpoint id sequence. In the trigger phase, the failure may happen before a checkpoint
* id has been generated; in that case the negated latest generated checkpoint id is passed
* in as a special flag.
*/
public void handleCheckpointException(CheckpointException exception, long checkpointId) {

Contributor Author:

Hi @StefanRRichter, I have refactored the counting mechanism based on your suggestion so that it follows the checkpoint id sequence. While implementing it, I hit a problem: a CheckpointException thrown in the trigger phase may not come with a checkpoint id.

Currently, the method triggerCheckpoint has two outcomes:

  • It returns a pending checkpoint (so we can get the checkpoint id).
  • It throws a CheckpointException (whether a checkpoint id exists depends on where in the method the exception is thrown).

So I cannot get the checkpoint id here.

My thought is that we could inject the checkpoint id into the CheckpointException (though the semantics look a bit strange), and use a default value (-1) when we cannot. Then, if CheckpointFailureManager receives -1 (meaning no checkpoint id was generated in the trigger phase), it would ignore this case. Actually, this does not seem to be a case we want to tolerate anyway.

What do you think?

Contributor:

@yanghua I just had a quick look at the implementation of the counting and it looks pretty complicated now, and I think there is even a bug: e.g. if the latest checkpoint completes and clears the map, and then an older checkpoint that was still ongoing fails, it will add to the count. I think that proper counting is maybe a task in itself, and in order to move quickly with this task I suggest that in this step:

  • we implement your initial idea for the simpler counting;
  • but already include the checkpoint id in the reporting, so we can later refine the counting based on order;
  • we create an issue for that refinement so that it is not forgotten and can be discussed.

Overall, this is an implementation detail that should not stop us from progressing fast at this point.

About your problem of getting the checkpoint id in some cases: my thought is that we cannot even include it in the exception, because some exceptions can happen before an id was assigned. However, if it happens in the trigger phase, we know that it happened for the very latest checkpoint. If we wanted, we could virtually consider such triggers as checkpoints that fall between ids for our ordering.

Contributor Author:

@StefanRRichter Thanks for your suggestion. Actually, I had not yet checked and verified the new counting logic; I was implementing it when I found it was blocked by the checkpoint id issue. Regarding the checkpoint id, I agree with you that adding it to the checkpoint exception is not a good idea.

So I will roll the current implementation back to the old one and create an issue to refactor the counting mechanism.

Contributor Author:

@StefanRRichter Still the old question: if I include the checkpoint id in this method (your second suggestion), what argument should be passed in here?

Contributor:

Well, what information do we need? You can say this is the checkpoint after the latest checkpoint id and before the next checkpoint id. So somehow it needs reporting of the current latest id, and also that it was not a "true" checkpoint id. Not saying this is the best way to encode it, but potentially (-latestGeneratedCheckpointId) would be a hacky way to provide all the info: the latest checkpoint, and negative because it is a virtual successor to that id but never a true part of the sequence of checkpoint ids.
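
A minimal sketch of how a consumer of that convention could decode the flag (illustrative only, not code from this PR):

	// A non-negative id is a real id from the checkpoint id sequence; a negative id means the
	// failure happened in the trigger phase, before a new id was assigned, and its absolute
	// value is the latest id generated so far.
	boolean failedBeforeIdAssignment = checkpointId < 0;
	long latestKnownCheckpointId = Math.abs(checkpointId);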

Contributor:

You should add the checkpoint id as a parameter, with an explanation in the docs. Especially after this discussion it should be clear that it justifies mentioning.

Contributor:

I still see one important issue in this method that we should address: receiving multiple error reports for the same checkpoint id will continue to increase the count. For example, if the DFS goes away and suddenly all tasks report a problem for the same ongoing checkpoint, this would drive up the count, right? One checkpoint id should only be able to increase the count once. A special case is probably failing triggers, which currently report negative checkpoint ids.

Contributor Author:

Good catch! You are right, I will consider and fix it.

Contributor Author:

I have added a check when counting. Since the failure reasons of the special case (negative checkpoint ids) are all ignored anyway, we do not need to take special care of it.

if (tolerableCpFailureNumber == UNLIMITED_TOLERABLE_FAILURE_NUMBER) {
return;
}

CheckpointFailureReason reason = exception.getCheckpointFailureReason();
switch (reason) {
case PERIODIC_SCHEDULER_SHUTDOWN:
case ALREADY_QUEUED:
case TOO_MANY_CONCURRENT_CHECKPOINTS:
case MINIMUM_TIME_BETWEEN_CHECKPOINTS:
case NOT_ALL_REQUIRED_TASKS_RUNNING:
case CHECKPOINT_SUBSUMED:
case CHECKPOINT_COORDINATOR_SUSPEND:
case CHECKPOINT_COORDINATOR_SHUTDOWN:
case JOB_FAILURE:
case JOB_FAILOVER_REGION:
//for compatibility purposes with user job behavior
case CHECKPOINT_DECLINED_TASK_NOT_READY:
case CHECKPOINT_DECLINED_TASK_NOT_CHECKPOINTING:
case CHECKPOINT_DECLINED_ALIGNMENT_LIMIT_EXCEEDED:
case CHECKPOINT_DECLINED_ON_CANCELLATION_BARRIER:
case CHECKPOINT_DECLINED_SUBSUMED:
case CHECKPOINT_DECLINED_INPUT_END_OF_STREAM:

case EXCEPTION:
case CHECKPOINT_EXPIRED:
case TASK_CHECKPOINT_FAILURE:
case TRIGGER_CHECKPOINT_FAILURE:
case FINALIZE_CHECKPOINT_FAILURE:
//ignore
break;

case CHECKPOINT_DECLINED:
//we should make sure one checkpoint is only counted once
if (countedCheckpointIds.add(checkpointId)) {
continuousFailureCounter.incrementAndGet();
}

break;

default:
throw new FlinkRuntimeException("Unknown checkpoint failure reason : " + reason.name());
}

if (continuousFailureCounter.get() > tolerableCpFailureNumber) {

Contributor Author:

This is the reason I did not use equality here. So if I accept your suggestion, this should change to continuousFailureCounter.get() > tolerableCpFailureNumber || continuousFailureCounter.get() < 0, where continuousFailureCounter.get() < 0 means overflow, right?

Contributor:

Exactly. I would even propose that if tolerableCpFailureNumber == Integer.MAX_VALUE we consider this as "unlimited failures allowed" and don't even query the current count, e.g.
if (tolerableCpFailureNumber != Integer.MAX_VALUE && continuousFailureCounter.get() > tolerableCpFailureNumber)

Contributor Author:

Accepted! Have updated the code.

clearCount();
failureCallback.failJob();
}
}

/**
* Handle checkpoint success.
*
* @param checkpointId the successful checkpoint id used to reset the continuous failure count
* based on the checkpoint id sequence.
*/
public void handleCheckpointSuccess(long checkpointId) {
clearCount();
}
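
As a usage sketch of the two methods above (hypothetical values; the lambda merely stands in for a FailJobCallback):

	CheckpointFailureManager manager = new CheckpointFailureManager(
		2,                                      // tolerate at most 2 continuous failures
		() -> System.out.println("fail job"));  // illustrative FailJobCallback

	CheckpointException declined =
		new CheckpointException(CheckpointFailureReason.CHECKPOINT_DECLINED);

	manager.handleCheckpointException(declined, 7L);  // counted: first report for checkpoint 7
	manager.handleCheckpointException(declined, 7L);  // ignored: id 7 is already in countedCheckpointIds
	manager.handleCheckpointException(declined, 8L);  // counted: 2 continuous failures, still tolerated
	manager.handleCheckpointException(declined, 9L);  // counted: 3 > 2, the callback fails the job and the count is cleared
	manager.handleCheckpointSuccess(10L);             // a completed checkpoint also clears the count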

private void clearCount() {
continuousFailureCounter.set(0);

Contributor:

I think either we don't need an atomic integer as the failure counter or this is broken for corner cases, e.g. missing a count. Maybe the methods should always be invoked under the lock of the checkpoint coordinator anyway. Also, I think it might make sense to track the errors by checkpoint id. For example, how do you count if cp-2 fails before a parallel cp-1 is successful? Wouldn't this reset the count to 0 even though the latest checkpoint failed?

Contributor Author:

Checkpoints can be executed in parallel. I think the "continuousFailureCounter" should just track the invocation order, not the checkpoint sequence number. What do you think?

Contributor:

I think this topic was not explicitly discussed in the design doc, but I feel like it might make sense to have the failure count somehow correlated with the checkpoint sequence. My reason is the following line of thought: why should an old checkpoint that finally succeeds after a while clear the failure count that was increased by newer checkpoints that already failed, for example if the failure is persistent and will continue to happen for all future checkpoints?

Contributor Author:

Good question. Actually, I think our focus is different. As the variable name continuousFailureCounter suggests, I am more concerned with "continuous" in the parallel scenario, while you are more concerned with a "failure counter" based on the checkpoint sequence (trigger order).

Tolerating successive checkpoint failures usually serves two purposes:

  • Users don't want the last successful checkpoint to fall too far behind the current time (if checkpoints cannot succeed from now on);
  • In many cases, continuous failures are caused by exceptions in a third-party system the job interacts with. For short-lived exceptions we are not inclined to restart the job; for long-lasting exceptions we are willing to restart it.

The first purpose aligns more with your view, while the second is more about behavior at execution time. In your example, if the old checkpoint succeeds (possibly indicating that the third-party system is back to normal), it is also acceptable to reset the counter to reduce the risk of a restart. My point is that the cost and risk of a job restart are very high, and judging by runtime continuity makes us less aggressive. What do you think about this?

But I admit that your point of view is easier for users to understand.

Contributor (@StefanRRichter, May 9, 2019):

About your second point: if an old checkpoint succeeds and everything is back to normal, that would also mean that the next newer checkpoint will very likely succeed and reset the counter anyway. So this scenario is (kind of) subsumed by the strategy I proposed: if the failure cause is gone, the next checkpoint newer than the last failing one will also clear the count. And if users are concerned about restart cost, they can set a higher tolerance. You can also argue the opposite way: it is expensive to wait a long time before restarting if the failure was persistent and you have to replay a lot of past events.

Contributor Author:

After thinking deeply, I agree with you. Following the checkpoint's sequence is a better choice. Accept.

countedCheckpointIds.clear();
}
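
As a rough illustration of the sequence-aware refinement discussed in the thread above (a hypothetical sketch in the direction of FLINK-12514, not part of this PR), the idea is to clear the count only when a checkpoint newer than the last counted failure completes:

	// Hypothetical sketch: remember the highest checkpoint id that was counted as a failure
	// (e.g. updated with Math::max when a failure is counted) and only reset on a newer success.
	private final AtomicLong highestCountedFailureId = new AtomicLong(-1L);   // java.util.concurrent.atomic.AtomicLong

	public void handleCheckpointSuccess(long checkpointId) {
		if (checkpointId > highestCountedFailureId.get()) {
			clearCount();
		}
	}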

/**
* A callback interface about how to fail a job.
*/
public interface FailJobCallback {

void failJob();

}

}
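
For context, a hedged sketch of how a caller might wire the callback when constructing the failure manager (names such as executionGraph and failGlobal are illustrative; the actual wiring is outside this file and may differ):

	CheckpointFailureManager failureManager = new CheckpointFailureManager(
		tolerableCheckpointFailureNumber,        // e.g. the new "0" passed through the test change above
		new CheckpointFailureManager.FailJobCallback() {
			@Override
			public void failJob() {
				// Illustration only: fail the whole job once the tolerance is exceeded.
				executionGraph.failGlobal(
					new FlinkRuntimeException("Exceeded checkpoint tolerable failure threshold."));
			}
		});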