From f9a1a9184ce16ee3e45a71549efbeabb323012e8 Mon Sep 17 00:00:00 2001 From: maghamravi Date: Fri, 17 Sep 2021 18:42:37 -0700 Subject: [PATCH 01/27] + using older aws sdk version --- flink-connectors/flink-connector-kinesis/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flink-connectors/flink-connector-kinesis/pom.xml b/flink-connectors/flink-connector-kinesis/pom.xml index ee66cd4a23046..c1259ce437876 100644 --- a/flink-connectors/flink-connector-kinesis/pom.xml +++ b/flink-connectors/flink-connector-kinesis/pom.xml @@ -33,7 +33,7 @@ under the License. flink-connector-kinesis_${scala.binary.version} Flink : Connectors : Kinesis - 1.12.7 + 1.11.603 2.16.86 1.11.2 0.14.0 From f373a63f9c8fde22690538777feae2f4258629d1 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 13:37:03 -0700 Subject: [PATCH 02/27] [LYFT] [STRMCMP-1388] pyflink changes --- flink-python/dev/dev-requirements.txt | 4 ++-- flink-python/setup.cfg | 3 +++ flink-python/setup.py | 18 ++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/flink-python/dev/dev-requirements.txt b/flink-python/dev/dev-requirements.txt index 0c957556102c4..a07b20072c62e 100755 --- a/flink-python/dev/dev-requirements.txt +++ b/flink-python/dev/dev-requirements.txt @@ -14,5 +14,5 @@ # limitations under the License. 
setuptools>=18.0 wheel -apache-beam==2.27.0 -cython==0.29.16 +apache-beam==2.30.0+lyft202205161652748117 +cython==0.28.1 diff --git a/flink-python/setup.cfg b/flink-python/setup.cfg index 138d2bd2c1505..f51e391997e51 100644 --- a/flink-python/setup.cfg +++ b/flink-python/setup.cfg @@ -21,3 +21,6 @@ universal = 1 [metadata] description-file = README.md + +[options] +python_requires= >=3.8 diff --git a/flink-python/setup.py b/flink-python/setup.py index efa6ffa1c5040..b2aad05fd4288 100644 --- a/flink-python/setup.py +++ b/flink-python/setup.py @@ -179,6 +179,8 @@ def extracted_output_files(base_dir, file_path, output_directory): file=sys.stderr) sys.exit(-1) VERSION = __version__ # noqa +APACHE_FLINK_VERSION = '1.13.0' + with io.open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf-8') as f: long_description = f.read() @@ -252,15 +254,7 @@ def extracted_output_files(base_dir, file_path, output_directory): "is complete, or do this in the flink-python directory of the flink source " "directory.") sys.exit(-1) - if VERSION.find('dev0') != -1: - apache_flink_libraries_dependency = 'apache-flink-libraries==%s' % VERSION - else: - split_versions = VERSION.split('.') - split_versions[-1] = str(int(split_versions[-1]) + 1) - NEXT_VERSION = '.'.join(split_versions) - apache_flink_libraries_dependency = 'apache-flink-libraries>=%s,<%s' % \ - (VERSION, NEXT_VERSION) - + apache_flink_libraries_dependency = 'apache-flink-libraries==%s' % APACHE_FLINK_VERSION script_names = ["pyflink-shell.sh", "find-flink-home.sh"] scripts = [os.path.join(SCRIPTS_TEMP_PATH, script) for script in script_names] scripts.append("pyflink/find_flink_home.py") @@ -307,10 +301,10 @@ def extracted_output_files(base_dir, file_path, output_directory): license='https://www.apache.org/licenses/LICENSE-2.0', author='Apache Software Foundation', author_email='dev@flink.apache.org', - python_requires='>=3.6', - install_requires=['py4j==0.10.8.1', 'python-dateutil==2.8.0', 'apache-beam==2.27.0', + 
python_requires='>3.6', + install_requires=['py4j>=0.10.8.1,<=0.10.9.5', 'python-dateutil>=2.8.1', 'apache-beam==2.30.0+lyft202205161652748117', 'cloudpickle==1.2.2', 'avro-python3>=1.8.1,!=1.9.2,<1.10.0', - 'pandas>=1.0,<1.2.0', 'pyarrow>=0.15.1,<3.0.0', + 'pandas>=1.0,<1.2.0', 'pyarrow>=0.15.1,<=8.0.0', 'pytz>=2018.3', 'numpy>=1.14.3,<1.20', 'fastavro>=0.21.4,<0.24', apache_flink_libraries_dependency], cmdclass={'build_ext': build_ext}, From 571c3e122870cfbff39d39692304a20b2fb77f51 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 13:39:55 -0700 Subject: [PATCH 03/27] [backport][FLINK-26846][python] Fix the gauge metric --- .../apache/flink/python/metric/FlinkMetricContainer.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/flink-python/src/main/java/org/apache/flink/python/metric/FlinkMetricContainer.java b/flink-python/src/main/java/org/apache/flink/python/metric/FlinkMetricContainer.java index 96e127faf2782..a184c39c6211e 100644 --- a/flink-python/src/main/java/org/apache/flink/python/metric/FlinkMetricContainer.java +++ b/flink-python/src/main/java/org/apache/flink/python/metric/FlinkMetricContainer.java @@ -33,7 +33,6 @@ import org.apache.beam.model.pipeline.v1.MetricsApi.MonitoringInfo; import org.apache.beam.runners.core.metrics.MetricsContainerImpl; import org.apache.beam.runners.core.metrics.MetricsContainerStepMap; -import org.apache.beam.runners.core.metrics.MonitoringInfoConstants; import org.apache.beam.runners.core.metrics.MonitoringInfoMetricName; import org.apache.beam.sdk.metrics.DistributionResult; import org.apache.beam.sdk.metrics.GaugeResult; @@ -106,10 +105,7 @@ private boolean isUserMetric(MetricResult metricResult) { MetricName metricName = metricResult.getKey().metricName(); if (metricName instanceof MonitoringInfoMetricName) { String urn = ((MonitoringInfoMetricName) metricName).getUrn(); - return urn.contains(MonitoringInfoConstants.Urns.USER_SUM_INT64) - || 
urn.contains(MonitoringInfoConstants.Urns.USER_SUM_DOUBLE) - || urn.contains(MonitoringInfoConstants.Urns.USER_DISTRIBUTION_DOUBLE) - || urn.contains(MonitoringInfoConstants.Urns.USER_DISTRIBUTION_INT64); + return urn.startsWith("beam:metric:user"); } return false; } From ec31f69d0cff88c69942ab027277f9b6f48264ad Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 13:46:57 -0700 Subject: [PATCH 04/27] [backport][FLINK-10052][ha] Tolerate temporarily suspended ZooKeeper connections --- .../ZooKeeperLeaderElectionDriver.java | 5 +- .../ZooKeeperLeaderRetrievalDriver.java | 4 +- .../flink/runtime/util/ZooKeeperUtils.java | 4 + ...eeperLeaderElectionConnectionLossTest.java | 139 ++++++++++++++++++ 4 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionConnectionLossTest.java diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionDriver.java b/flink-runtime/src/main/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionDriver.java index 855ebca247a38..663418acff89b 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionDriver.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionDriver.java @@ -273,10 +273,7 @@ private void handleStateChange(ConnectionState newState) { LOG.debug("Connected to ZooKeeper quorum. Leader election can start."); break; case SUSPENDED: - LOG.warn( - "Connection to ZooKeeper suspended. 
The contender " - + leaderContenderDescription - + " no longer participates in the leader election."); + LOG.warn("Connection to ZooKeeper suspended, waiting for reconnection."); break; case RECONNECTED: LOG.info( diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/leaderretrieval/ZooKeeperLeaderRetrievalDriver.java b/flink-runtime/src/main/java/org/apache/flink/runtime/leaderretrieval/ZooKeeperLeaderRetrievalDriver.java index f447a17e1ce4e..73576a99380d0 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/leaderretrieval/ZooKeeperLeaderRetrievalDriver.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/leaderretrieval/ZooKeeperLeaderRetrievalDriver.java @@ -155,9 +155,7 @@ private void handleStateChange(ConnectionState newState) { LOG.debug("Connected to ZooKeeper quorum. Leader retrieval can start."); break; case SUSPENDED: - LOG.warn( - "Connection to ZooKeeper suspended. Can no longer retrieve the leader from " - + "ZooKeeper."); + LOG.warn("Connection to ZooKeeper suspended, waiting for reconnection."); leaderRetrievalEventHandler.notifyLeaderAddress(LeaderInformation.empty()); break; case RECONNECTED: diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java b/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java index 0af2248340ffe..c4fb14bac7402 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java @@ -47,6 +47,9 @@ import org.apache.flink.runtime.persistence.RetrievableStateStorageHelper; import org.apache.flink.runtime.persistence.filesystem.FileSystemStateStorageHelper; import org.apache.flink.runtime.zookeeper.ZooKeeperStateHandleStore; + +import org.apache.flink.shaded.curator4.org.apache.curator.framework.state.SessionConnectionStateErrorPolicy; + import org.apache.flink.util.Preconditions; import 
org.apache.flink.shaded.curator4.org.apache.curator.framework.CuratorFramework; @@ -150,6 +153,7 @@ public static CuratorFramework startCuratorFramework(Configuration configuration .sessionTimeoutMs(sessionTimeout) .connectionTimeoutMs(connectionTimeout) .retryPolicy(new ExponentialBackoffRetry(retryWait, maxRetryAttempts)) + .connectionStateErrorPolicy(new SessionConnectionStateErrorPolicy()) // Curator prepends a '/' manually and throws an Exception if the // namespace starts with a '/'. .namespace( diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionConnectionLossTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionConnectionLossTest.java new file mode 100644 index 0000000000000..4a5e4cece79b9 --- /dev/null +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/leaderelection/ZooKeeperLeaderElectionConnectionLossTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.leaderelection; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.HighAvailabilityOptions; +import org.apache.flink.core.testutils.OneShotLatch; +import org.apache.flink.runtime.util.ZooKeeperUtils; +import org.apache.flink.runtime.zookeeper.ZooKeeperResource; +import org.apache.flink.util.TestLogger; + +import org.apache.flink.shaded.curator4.org.apache.curator.framework.CuratorFramework; +import org.apache.flink.shaded.curator4.org.apache.curator.framework.state.ConnectionState; +import org.apache.flink.shaded.curator4.org.apache.curator.framework.state.ConnectionStateListener; + +import org.apache.zookeeper.KeeperException.ConnectionLossException; +import org.junit.Rule; +import org.junit.Test; + +import java.time.Duration; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.fail; + +/** Test behaviors of {@link ZooKeeperLeaderElectionDriver} on {@link ConnectionLossException} */ +public class ZooKeeperLeaderElectionConnectionLossTest extends TestLogger { + + private static final String LATCH_PATH = "/latch"; + private static final String LEADER_PATH = "/leader"; + + private static final Duration TIMEOUT = Duration.ofMillis(2000L); + + @Rule public final ZooKeeperResource zooKeeperResource = new ZooKeeperResource(); + + @Test + public void testKeepLeadershipOnConnectionLoss() throws Exception { + final Configuration configuration = new Configuration(); + configuration.setString( + HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString()); + + CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration); + LeaderElectionDriverFactory leaderElectionDriverFactory = + new ZooKeeperLeaderElectionDriverFactory(client, LATCH_PATH, LEADER_PATH); + DefaultLeaderElectionService leaderElectionService = + new 
DefaultLeaderElectionService(leaderElectionDriverFactory); + + try { + final OneShotLatch connectionLossLatch = new OneShotLatch(); + final OneShotLatch reconnectedLatch = new OneShotLatch(); + client.getConnectionStateListenable() + .addListener( + new TestingConnectionStateListener( + connectionLossLatch, reconnectedLatch)); + + final OneShotLatch grantLeadershipLatch = new OneShotLatch(); + final OneShotLatch revokeLeadershipLatch = new OneShotLatch(); + leaderElectionService.start( + new TestingContender(grantLeadershipLatch, revokeLeadershipLatch)); + + grantLeadershipLatch.await(TIMEOUT.toMillis(), TimeUnit.MILLISECONDS); + zooKeeperResource.restart(); + connectionLossLatch.await(TIMEOUT.toMillis(), TimeUnit.MILLISECONDS); + reconnectedLatch.await(TIMEOUT.toMillis(), TimeUnit.MILLISECONDS); + assertFalse(revokeLeadershipLatch.isTriggered()); + } finally { + leaderElectionService.stop(); + client.close(); + } + } + + private static final class TestingContender implements LeaderContender { + + private final OneShotLatch grantLeadershipLatch; + private final OneShotLatch revokeLeadershipLatch; + + public TestingContender( + OneShotLatch grantLeadershipLatch, OneShotLatch revokeLeadershipLatch) { + this.grantLeadershipLatch = grantLeadershipLatch; + this.revokeLeadershipLatch = revokeLeadershipLatch; + } + + @Override + public void grantLeadership(UUID leaderSessionID) { + grantLeadershipLatch.trigger(); + } + + @Override + public void revokeLeadership() { + revokeLeadershipLatch.trigger(); + } + + @Override + public void handleError(Exception exception) { + exception.printStackTrace(); + fail(exception.getMessage()); + } + } + + private static final class TestingConnectionStateListener implements ConnectionStateListener { + private final OneShotLatch connectionLossLatch; + private final OneShotLatch reconnectedLatch; + + public TestingConnectionStateListener( + OneShotLatch connectionLossLatch, OneShotLatch reconnectedLatch) { + this.connectionLossLatch = 
connectionLossLatch; + this.reconnectedLatch = reconnectedLatch; + } + + @Override + public void stateChanged( + CuratorFramework curatorFramework, ConnectionState connectionState) { + if (connectionState == ConnectionState.SUSPENDED) { + connectionLossLatch.trigger(); + } + + if (connectionState == ConnectionState.RECONNECTED) { + reconnectedLatch.trigger(); + } + } + } +} From 511f5b77b3b22dcd6d8d73c6c88473453c72de11 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 13:55:44 -0700 Subject: [PATCH 05/27] [backport][FLINK-25437][python] Correct grpcio dependency version in dev-requirenment.txt --- flink-python/dev/dev-requirements.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/flink-python/dev/dev-requirements.txt b/flink-python/dev/dev-requirements.txt index a07b20072c62e..e3f3be36d9823 100755 --- a/flink-python/dev/dev-requirements.txt +++ b/flink-python/dev/dev-requirements.txt @@ -16,3 +16,16 @@ setuptools>=18.0 wheel apache-beam==2.30.0+lyft202205161652748117 cython==0.28.1 +py4j==0.10.8.1 +python-dateutil==2.8.0 +cloudpickle==1.2.2 +avro-python3>=1.8.1,!=1.9.2,<1.10.0 +pandas>=1.0,<1.2.0 +pyarrow>=0.15.1,<8.0.0 +pytz>=2018.3 +numpy>=1.14.3,<1.20 +fastavro>=0.21.4,<0.24 +grpcio>=1.29.0,<2 +grpcio-tools>=1.3.5,<=1.14.2 +requests>=2.26.0 +protobuf<3.18 From f818b11490c7593cbd65f1de1410266d284e7f5a Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 14:05:58 -0700 Subject: [PATCH 06/27] [backport][FLINK-24049][python] Handle properly for field types need conversion in TupleTypeInfo --- flink-python/pyflink/common/typeinfo.py | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/flink-python/pyflink/common/typeinfo.py b/flink-python/pyflink/common/typeinfo.py index 8467241da8f4e..aa68c5995cfd2 100644 --- a/flink-python/pyflink/common/typeinfo.py +++ b/flink-python/pyflink/common/typeinfo.py @@ -425,6 +425,9 @@ class TupleTypeInfo(TypeInformation): def __init__(self, field_types: 
List[TypeInformation]): self._field_types = field_types + self._need_conversion = [f.need_conversion() if isinstance(f, TypeInformation) else None + for f in self._field_types] + self._need_serialize_any_field = any(self._need_conversion) super(TupleTypeInfo, self).__init__() def get_field_types(self) -> List[TypeInformation]: @@ -445,6 +448,39 @@ def get_java_type_info(self) -> JavaObject: .org.apache.flink.api.java.typeutils.TupleTypeInfo(j_types_array) return self._j_typeinfo + def need_conversion(self): + return True + + def to_internal_type(self, obj): + if obj is None: + return + from pyflink.common import Row + if self._need_serialize_any_field: + # Only calling to_internal_type function for fields that need conversion + if isinstance(obj, (list, tuple, Row)): + return tuple( + f.to_internal_type(v) if c else v + for f, v, c in zip(self._field_types, obj, self._need_conversion)) + else: + raise ValueError("Unexpected tuple %r with TupleTypeInfo" % obj) + else: + if isinstance(obj, (list, tuple, Row)): + return tuple(obj) + else: + raise ValueError("Unexpected tuple %r with TupleTypeInfo" % obj) + + def from_internal_type(self, obj): + if obj is None or isinstance(obj, (tuple, list)): + # it's already converted by pickler + return obj + if self._need_serialize_any_field: + # Only calling from_internal_type function for fields that need conversion + values = [f.from_internal_type(v) if c else v + for f, v, c in zip(self._field_types, obj, self._need_conversion)] + else: + values = obj + return tuple(values) + def __eq__(self, other) -> bool: if isinstance(other, TupleTypeInfo): return self._field_types == other._field_types From 10c159c8cb7e634ea10b307dad764cc0c72622ac Mon Sep 17 00:00:00 2001 From: ravimagham Date: Mon, 20 Jun 2022 16:35:13 -0700 Subject: [PATCH 07/27] [LYFT][STRMCMP-1388] pyflink changes --- flink-python/dev/dev-requirements.txt | 13 ------------- .../apache/flink/runtime/util/ZooKeeperUtils.java | 4 +--- 2 files changed, 1 insertion(+), 16 
deletions(-) diff --git a/flink-python/dev/dev-requirements.txt b/flink-python/dev/dev-requirements.txt index e3f3be36d9823..a07b20072c62e 100755 --- a/flink-python/dev/dev-requirements.txt +++ b/flink-python/dev/dev-requirements.txt @@ -16,16 +16,3 @@ setuptools>=18.0 wheel apache-beam==2.30.0+lyft202205161652748117 cython==0.28.1 -py4j==0.10.8.1 -python-dateutil==2.8.0 -cloudpickle==1.2.2 -avro-python3>=1.8.1,!=1.9.2,<1.10.0 -pandas>=1.0,<1.2.0 -pyarrow>=0.15.1,<8.0.0 -pytz>=2018.3 -numpy>=1.14.3,<1.20 -fastavro>=0.21.4,<0.24 -grpcio>=1.29.0,<2 -grpcio-tools>=1.3.5,<=1.14.2 -requests>=2.26.0 -protobuf<3.18 diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java b/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java index c4fb14bac7402..22a1897e0e6ae 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/util/ZooKeeperUtils.java @@ -47,9 +47,6 @@ import org.apache.flink.runtime.persistence.RetrievableStateStorageHelper; import org.apache.flink.runtime.persistence.filesystem.FileSystemStateStorageHelper; import org.apache.flink.runtime.zookeeper.ZooKeeperStateHandleStore; - -import org.apache.flink.shaded.curator4.org.apache.curator.framework.state.SessionConnectionStateErrorPolicy; - import org.apache.flink.util.Preconditions; import org.apache.flink.shaded.curator4.org.apache.curator.framework.CuratorFramework; @@ -57,6 +54,7 @@ import org.apache.flink.shaded.curator4.org.apache.curator.framework.api.ACLProvider; import org.apache.flink.shaded.curator4.org.apache.curator.framework.imps.DefaultACLProvider; import org.apache.flink.shaded.curator4.org.apache.curator.framework.recipes.cache.PathChildrenCache; +import org.apache.flink.shaded.curator4.org.apache.curator.framework.state.SessionConnectionStateErrorPolicy; import org.apache.flink.shaded.curator4.org.apache.curator.retry.ExponentialBackoffRetry; import 
org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.ZooDefs; import org.apache.flink.shaded.zookeeper3.org.apache.zookeeper.data.ACL; From 2a55298023ac3bf15d960680b9d9193ad9272111 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Mon, 18 Jul 2022 15:55:36 -0700 Subject: [PATCH 08/27] add options to enable lib --- flink-python/apache-flink-libraries/setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flink-python/apache-flink-libraries/setup.cfg b/flink-python/apache-flink-libraries/setup.cfg index 138d2bd2c1505..f51e391997e51 100644 --- a/flink-python/apache-flink-libraries/setup.cfg +++ b/flink-python/apache-flink-libraries/setup.cfg @@ -21,3 +21,6 @@ universal = 1 [metadata] description-file = README.md + +[options] +python_requires= >=3.8 From c4f2a06a1b0bc0f8d92cbd37749175b218d535e7 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Wed, 20 Jul 2022 15:15:59 -0700 Subject: [PATCH 09/27] test --- flink-python/apache-flink-libraries/setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index d2b9521f65ac7..5e5af1fd40392 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -98,6 +98,8 @@ def find_file_path(pattern): file=sys.stderr) sys.exit(-1) flink_version = VERSION.replace(".dev0", "-SNAPSHOT") + flink_version = VERSION.replace("+", "-") + FLINK_HOME = os.path.abspath( "../../flink-dist/target/flink-%s-bin/flink-%s" % (flink_version, flink_version)) From 0ed52faa2115b8670d4dbbedf5bfba0b4f965a9b Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Wed, 20 Jul 2022 16:03:31 -0700 Subject: [PATCH 10/27] wip --- flink-python/apache-flink-libraries/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index 5e5af1fd40392..42b99b5dc7e66 100644 --- 
a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -98,7 +98,7 @@ def find_file_path(pattern): file=sys.stderr) sys.exit(-1) flink_version = VERSION.replace(".dev0", "-SNAPSHOT") - flink_version = VERSION.replace("+", "-") + flink_version = flink_version.replace("+", "-") FLINK_HOME = os.path.abspath( "../../flink-dist/target/flink-%s-bin/flink-%s" % (flink_version, flink_version)) From 9e632065fd7a08589437c88b5f92afbd98fdef6b Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Fri, 22 Jul 2022 15:26:16 -0700 Subject: [PATCH 11/27] [LYFT][DSP] Copy Hive JAR into apache-flink-libraries at setup time --- flink-python/apache-flink-libraries/setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index d2b9521f65ac7..f9bc64e9a864e 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -119,6 +119,8 @@ def find_file_path(pattern): find_file_path(os.path.join(OPT_PATH, "flink-python_*.jar"))) OPT_SQL_CLIENT_JAR_NAME = os.path.basename( find_file_path(os.path.join(OPT_PATH, "flink-sql-client_*.jar"))) + OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( + find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive_3.1.2_*.jar"))) LICENSES_PATH = os.path.join(FLINK_HOME, "licenses") PLUGINS_PATH = os.path.join(FLINK_HOME, "plugins") SCRIPTS_PATH = os.path.join(FLINK_HOME, "bin") @@ -144,6 +146,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) + os.symlink(os.path.join(OPT_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(OPT_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) os.symlink(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) 
os.symlink(README_FILE_PATH, README_FILE_TEMP_PATH) @@ -154,6 +158,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) copy(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) + copy(os.path.join(OPT_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(OPT_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) copytree(PLUGINS_PATH, PLUGINS_TEMP_PATH) copy(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) copy(README_FILE_PATH, README_FILE_TEMP_PATH) From 365ffa8ad0ab13d2f227cb8fd9d55bb8248805ea Mon Sep 17 00:00:00 2001 From: Adil Hafeez Date: Tue, 26 Jul 2022 12:18:33 -0700 Subject: [PATCH 12/27] update zk version to 3.5.6 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 226abad21dd54..a0afdeab01b1c 100644 --- a/pom.xml +++ b/pom.xml @@ -119,7 +119,7 @@ under the License. 2.11.12 2.11 0.7.6 - 3.4.14 + 3.5.6 2.12.0 2.12.1 From c199bf9223e8d3f60951faf020d23cda8f1e6710 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Fri, 5 Aug 2022 16:40:03 -0700 Subject: [PATCH 13/27] Update setup.py --- flink-python/apache-flink-libraries/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index 0277a6dad91ab..f628f68f4423a 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -122,7 +122,7 @@ def find_file_path(pattern): OPT_SQL_CLIENT_JAR_NAME = os.path.basename( find_file_path(os.path.join(OPT_PATH, "flink-sql-client_*.jar"))) OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( - find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive_3.1.2_*.jar"))) + find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive-3.1.2_*.jar"))) LICENSES_PATH = os.path.join(FLINK_HOME, "licenses") PLUGINS_PATH = os.path.join(FLINK_HOME, "plugins") SCRIPTS_PATH = 
os.path.join(FLINK_HOME, "bin") From 4d39752e0d44d0da75dd049fbeca204a6b178c28 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Fri, 17 Dec 2021 03:22:38 -0800 Subject: [PATCH 14/27] [RTOP-645] [backport] Job and Execution status metrics --- .../generated/metric_configuration.html | 6 + .../flink/configuration/DescribedEnum.java | 46 +++ .../flink/configuration/MetricOptions.java | 87 ++++++ flink-runtime/pom.xml | 21 ++ .../executiongraph/DefaultExecutionGraph.java | 7 +- .../DefaultExecutionGraphBuilder.java | 2 - .../runtime/executiongraph/Execution.java | 2 +- .../ExecutionStateUpdateListener.java | 3 +- .../executiongraph/ExecutionVertex.java | 5 +- .../InternalExecutionGraphAccessor.java | 5 +- .../metrics/RestartTimeGauge.java | 80 ----- .../metrics/groups/JobManagerMetricGroup.java | 5 + .../DefaultExecutionGraphFactory.java | 8 +- .../scheduler/ExecutionGraphFactory.java | 2 + .../runtime/scheduler/JobStatusStore.java | 51 ++++ .../runtime/scheduler/SchedulerBase.java | 50 +++- .../scheduler/adaptive/AdaptiveScheduler.java | 59 +++- .../metrics/DeploymentStateTimeMetrics.java | 170 +++++++++++ .../scheduler/metrics/JobStatusMetrics.java | 113 +++++++ .../scheduler/metrics/MetricsRegistrar.java | 25 ++ .../scheduler/metrics/StateTimeMetric.java | 72 +++++ .../TestingDefaultExecutionGraphBuilder.java | 3 +- .../metrics/RestartTimeGaugeTest.java | 84 ------ .../jobmaster/slotpool/SlotPoolTestUtils.java | 8 + .../DefaultExecutionGraphFactoryTest.java | 2 + .../adaptive/AdaptiveSchedulerTest.java | 119 +++++++- .../scheduler/adaptive/ExecutingTest.java | 5 +- .../ExecutionStateTimeMetricsTest.java | 279 ++++++++++++++++++ .../metrics/JobStatusMetricsTest.java | 130 ++++++++ .../metrics/StateTimeMetricTest.java | 170 +++++++++++ .../flink-test-utils-junit/pom.xml | 1 + 31 files changed, 1420 insertions(+), 200 deletions(-) create mode 100644 flink-core/src/main/java/org/apache/flink/configuration/DescribedEnum.java delete mode 100644 
flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGauge.java create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/JobStatusStore.java create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/DeploymentStateTimeMetrics.java create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetrics.java create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/MetricsRegistrar.java create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetric.java delete mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGaugeTest.java create mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/ExecutionStateTimeMetricsTest.java create mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetricsTest.java create mode 100644 flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetricTest.java diff --git a/docs/layouts/shortcodes/generated/metric_configuration.html b/docs/layouts/shortcodes/generated/metric_configuration.html index b906b383b7e97..c0fbc8690544b 100644 --- a/docs/layouts/shortcodes/generated/metric_configuration.html +++ b/docs/layouts/shortcodes/generated/metric_configuration.html @@ -26,6 +26,12 @@ Integer The thread priority used for Flink's internal metric query service. The thread is created by Akka's thread pool executor. The range of the priority is from 1 (MIN_PRIORITY) to 10 (MAX_PRIORITY). Warning, increasing this value may bring the main Flink components down. + +
metrics.job.status.enable
+ CURRENT_TIME +

List<Enum>

+ The selection of job status metrics that should be reported.

Possible values:
  • "STATE": For a given state, return 1 if the job is currently in that state, otherwise return 0.
  • "CURRENT_TIME": For a given state, if the job is currently in that state, return the time since the job transitioned into that state, otherwise return 0.
  • "TOTAL_TIME": For a given state, return how much time the job has spent in that state in total.
+
metrics.latency.granularity
"operator" diff --git a/flink-core/src/main/java/org/apache/flink/configuration/DescribedEnum.java b/flink-core/src/main/java/org/apache/flink/configuration/DescribedEnum.java new file mode 100644 index 0000000000000..df5708179e332 --- /dev/null +++ b/flink-core/src/main/java/org/apache/flink/configuration/DescribedEnum.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.configuration; + +import org.apache.flink.annotation.PublicEvolving; +import org.apache.flink.configuration.description.Description; +import org.apache.flink.configuration.description.InlineElement; +import org.apache.flink.configuration.description.TextElement; + +/** + * Describe enum constants used in {@link ConfigOption}s. + * + *

For enums used as config options, this interface can be implemented to provide a {@link + * Description} for each enum constant. This will be used when generating documentation for config + * options to include a list of available values alongside their respective descriptions. + * + *

More precisely, only an {@link InlineElement} can be returned as block elements cannot be + * nested into a list. + */ +@PublicEvolving +public interface DescribedEnum { + + /** + * Returns the description for the enum constant. + * + *

If you want to include links or code blocks, use {@link + * TextElement#wrap(InlineElement...)} to wrap multiple inline elements into a single one. + */ + InlineElement getDescription(); +} diff --git a/flink-core/src/main/java/org/apache/flink/configuration/MetricOptions.java b/flink-core/src/main/java/org/apache/flink/configuration/MetricOptions.java index 0ba3cf56d6f47..ea32d5cc5f9d3 100644 --- a/flink-core/src/main/java/org/apache/flink/configuration/MetricOptions.java +++ b/flink-core/src/main/java/org/apache/flink/configuration/MetricOptions.java @@ -21,8 +21,11 @@ import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.annotation.docs.Documentation; import org.apache.flink.configuration.description.Description; +import org.apache.flink.configuration.description.InlineElement; +import org.apache.flink.configuration.description.TextElement; import java.time.Duration; +import java.util.List; import static org.apache.flink.configuration.ConfigOptions.key; import static org.apache.flink.configuration.description.TextElement.text; @@ -215,5 +218,89 @@ public class MetricOptions { + "faster updating metrics. Increase this value if the metric fetcher causes too much load. Setting this value to 0 " + "disables the metric fetching completely."); + /** Controls which job status metrics will be exposed. */ + public static final ConfigOption> JOB_STATUS_METRICS = + key("metrics.job.status.enable") + .enumType(JobStatusMetrics.class) + .asList() + .defaultValues(JobStatusMetrics.CURRENT_TIME) + .withDescription( + "The selection of job status metrics that should be reported."); + + /** Enum describing the different kinds of job status metrics. 
*/ + public enum JobStatusMetrics implements DescribedEnum { + STATE( + "For a given state, return 1 if the job is currently in that state, otherwise return 0."), + CURRENT_TIME( + "For a given state, if the job is currently in that state, return the time since the job transitioned into that state, otherwise return 0."), + TOTAL_TIME( + "For a given state, return how much time the job has spent in that state in total."), + ; + + private final String description; + + JobStatusMetrics(String description) { + this.description = description; + } + + @Override + public InlineElement getDescription() { + return TextElement.text(description); + } + } + + /** Describes which job status metrics have been enabled. */ + public static final class JobStatusMetricsSettings { + + private final boolean stateMetricsEnabled; + private final boolean currentTimeMetricsEnabled; + private final boolean totalTimeMetricsEnabled; + + private JobStatusMetricsSettings( + boolean stateMetricsEnabled, + boolean currentTimeMetricsEnabled, + boolean totalTimeMetricsEnabled) { + this.stateMetricsEnabled = stateMetricsEnabled; + this.currentTimeMetricsEnabled = currentTimeMetricsEnabled; + this.totalTimeMetricsEnabled = totalTimeMetricsEnabled; + } + + public boolean isStateMetricsEnabled() { + return stateMetricsEnabled; + } + + public boolean isCurrentTimeMetricsEnabled() { + return currentTimeMetricsEnabled; + } + + public boolean isTotalTimeMetricsEnabled() { + return totalTimeMetricsEnabled; + } + + public static JobStatusMetricsSettings fromConfiguration(Configuration configuration) { + final List jobStatusMetrics = configuration.get(JOB_STATUS_METRICS); + boolean stateMetricsEnabled = true; + boolean currentTimeMetricsEnabled = true; + boolean totalTimeMetricsEnabled = true; + + for (JobStatusMetrics jobStatusMetric : jobStatusMetrics) { + switch (jobStatusMetric) { + case STATE: + stateMetricsEnabled = true; + break; + case CURRENT_TIME: + currentTimeMetricsEnabled = true; + break; + case 
TOTAL_TIME: + totalTimeMetricsEnabled = true; + break; + } + } + + return new JobStatusMetricsSettings( + stateMetricsEnabled, currentTimeMetricsEnabled, totalTimeMetricsEnabled); + } + } + private MetricOptions() {} } diff --git a/flink-runtime/pom.xml b/flink-runtime/pom.xml index 7518af8158108..e5f9060a18a2d 100644 --- a/flink-runtime/pom.xml +++ b/flink-runtime/pom.xml @@ -321,6 +321,27 @@ under the License. oshi-core true + + + org.junit.jupiter + junit-jupiter-api + 5.5.2 + test + + + + org.junit.jupiter + junit-jupiter-engine + 5.5.2 + test + + + + org.assertj + assertj-core + 3.23.1 + compile + diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraph.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraph.java index 26ac38f2d540c..c3a3d8cca24ee 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraph.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraph.java @@ -1441,8 +1441,11 @@ private void notifyJobStatusChange(JobStatus newState, Throwable error) { @Override public void notifyExecutionChange( - final Execution execution, final ExecutionState newExecutionState) { - executionStateUpdateListener.onStateUpdate(execution.getAttemptId(), newExecutionState); + final Execution execution, + ExecutionState previousState, + final ExecutionState newExecutionState) { + executionStateUpdateListener.onStateUpdate( + execution.getAttemptId(), previousState, newExecutionState); } private void assertRunningInJobMasterMainThread() { diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraphBuilder.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraphBuilder.java index be546c4c56f9b..2a1287b647587 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraphBuilder.java 
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/DefaultExecutionGraphBuilder.java @@ -39,7 +39,6 @@ import org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionReleaseStrategy; import org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionReleaseStrategyFactoryLoader; import org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge; -import org.apache.flink.runtime.executiongraph.metrics.RestartTimeGauge; import org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge; import org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker; import org.apache.flink.runtime.jobgraph.JobGraph; @@ -321,7 +320,6 @@ public static DefaultExecutionGraph buildGraph( // create all the metrics for the Execution Graph - metrics.gauge(RestartTimeGauge.METRIC_NAME, new RestartTimeGauge(executionGraph)); metrics.gauge(DownTimeGauge.METRIC_NAME, new DownTimeGauge(executionGraph)); metrics.gauge(UpTimeGauge.METRIC_NAME, new UpTimeGauge(executionGraph)); diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/Execution.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/Execution.java index 10e073ffbc1d8..12efa83b38db4 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/Execution.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/Execution.java @@ -1464,7 +1464,7 @@ private boolean transitionState( // make sure that the state transition completes normally. 
// potential errors (in listeners may not affect the main logic) try { - vertex.notifyStateTransition(this, targetState); + vertex.notifyStateTransition(this, currentState, targetState); } catch (Throwable t) { LOG.error( "Error while notifying execution graph of execution state transition.", t); diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionStateUpdateListener.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionStateUpdateListener.java index aa61bad18cd78..3f525e535fb7e 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionStateUpdateListener.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionStateUpdateListener.java @@ -21,5 +21,6 @@ /** A listener that is called when an execution switched to a new state. */ public interface ExecutionStateUpdateListener { - void onStateUpdate(ExecutionAttemptID execution, ExecutionState newState); + void onStateUpdate( + ExecutionAttemptID execution, ExecutionState previousState, ExecutionState newState); } diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionVertex.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionVertex.java index 032e3d0b7726f..7b8920e4b18be 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionVertex.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/ExecutionVertex.java @@ -546,11 +546,12 @@ void notifyCompletedDeployment(Execution execution) { } /** Simply forward this notification. 
*/ - void notifyStateTransition(Execution execution, ExecutionState newState) { + void notifyStateTransition( + Execution execution, ExecutionState previousState, ExecutionState newState) { // only forward this notification if the execution is still the current execution // otherwise we have an outdated execution if (isCurrentExecution(execution)) { - getExecutionGraphAccessor().notifyExecutionChange(execution, newState); + getExecutionGraphAccessor().notifyExecutionChange(execution, previousState, newState); } } diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/InternalExecutionGraphAccessor.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/InternalExecutionGraphAccessor.java index 0e184ea09c6ab..a0ac9b1f98b7d 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/InternalExecutionGraphAccessor.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/InternalExecutionGraphAccessor.java @@ -90,7 +90,10 @@ public interface InternalExecutionGraphAccessor { */ void failGlobal(Throwable t); - void notifyExecutionChange(final Execution execution, final ExecutionState newExecutionState); + void notifyExecutionChange( + final Execution execution, + ExecutionState previousState, + final ExecutionState newExecutionState); void notifySchedulerNgAboutInternalTaskFailure( ExecutionAttemptID attemptId, diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGauge.java b/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGauge.java deleted file mode 100644 index 6840f7ec79fda..0000000000000 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGauge.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.runtime.executiongraph.metrics; - -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.metrics.Gauge; -import org.apache.flink.runtime.executiongraph.JobStatusProvider; - -import static org.apache.flink.util.Preconditions.checkNotNull; - -/** - * Gauge which returns the last restarting time. - * - *

Restarting time is the time between {@link JobStatus#RESTARTING} and {@link - * JobStatus#RUNNING}, or a terminal state if {@link JobStatus#RUNNING} was not reached. - * - *

If the job has not yet reached either of these states, then the time is measured since - * reaching {@link JobStatus#RESTARTING}. If it is still the initial job execution, then the gauge - * will return 0. - */ -public class RestartTimeGauge implements Gauge { - - public static final String METRIC_NAME = "restartingTime"; - - // ------------------------------------------------------------------------ - - private final JobStatusProvider jobStatusProvider; - - public RestartTimeGauge(JobStatusProvider jobStatusProvider) { - this.jobStatusProvider = checkNotNull(jobStatusProvider); - } - - // ------------------------------------------------------------------------ - - @Override - public Long getValue() { - final JobStatus status = jobStatusProvider.getState(); - - final long restartingTimestamp = jobStatusProvider.getStatusTimestamp(JobStatus.RESTARTING); - - final long switchToRunningTimestamp; - final long lastRestartTime; - - if (restartingTimestamp <= 0) { - // we haven't yet restarted our job - return 0L; - } else if ((switchToRunningTimestamp = - jobStatusProvider.getStatusTimestamp(JobStatus.RUNNING)) - >= restartingTimestamp) { - // we have transitioned to RUNNING since the last restart - lastRestartTime = switchToRunningTimestamp - restartingTimestamp; - } else if (status.isTerminalState()) { - // since the last restart we've switched to a terminal state without touching - // the RUNNING state (e.g. 
failing from RESTARTING) - lastRestartTime = jobStatusProvider.getStatusTimestamp(status) - restartingTimestamp; - } else { - // we're still somewhere between RESTARTING and RUNNING - lastRestartTime = System.currentTimeMillis() - restartingTimestamp; - } - - // we guard this with 'Math.max' to avoid negative timestamps when clocks re-sync - return Math.max(lastRestartTime, 0); - } -} diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/groups/JobManagerMetricGroup.java b/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/groups/JobManagerMetricGroup.java index bb2cbe6a0ec9f..9f484cedd94a7 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/groups/JobManagerMetricGroup.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/groups/JobManagerMetricGroup.java @@ -48,6 +48,11 @@ public JobManagerMetricGroup(MetricRegistry registry, String hostname) { this.hostname = hostname; } + public static JobManagerMetricGroup createJobManagerMetricGroup( + final MetricRegistry metricRegistry, final String hostname) { + return new JobManagerMetricGroup(metricRegistry, hostname); + } + public String hostname() { return hostname; } diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactory.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactory.java index f1721c72b3a45..429d38d4e4ae1 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactory.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactory.java @@ -92,12 +92,14 @@ public ExecutionGraph createAndRestoreExecutionGraph( long initializationTimestamp, VertexAttemptNumberStore vertexAttemptNumberStore, VertexParallelismStore vertexParallelismStore, + ExecutionStateUpdateListener executionStateUpdateListener, Logger log) throws Exception { ExecutionDeploymentListener 
executionDeploymentListener = new ExecutionDeploymentTrackerDeploymentListenerAdapter(executionDeploymentTracker); - ExecutionStateUpdateListener executionStateUpdateListener = - (execution, newState) -> { + ExecutionStateUpdateListener combinedExecutionStateUpdateListener = + (execution, previousState, newState) -> { + executionStateUpdateListener.onStateUpdate(execution, previousState, newState); if (newState.isTerminal()) { executionDeploymentTracker.stopTrackingDeploymentOf(execution); } @@ -121,7 +123,7 @@ public ExecutionGraph createAndRestoreExecutionGraph( jobMasterPartitionTracker, partitionLocationConstraint, executionDeploymentListener, - executionStateUpdateListener, + combinedExecutionStateUpdateListener, initializationTimestamp, vertexAttemptNumberStore, vertexParallelismStore); diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/ExecutionGraphFactory.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/ExecutionGraphFactory.java index b38dc10a52796..eb3b535e980cb 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/ExecutionGraphFactory.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/ExecutionGraphFactory.java @@ -23,6 +23,7 @@ import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore; import org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory; import org.apache.flink.runtime.executiongraph.ExecutionGraph; +import org.apache.flink.runtime.executiongraph.ExecutionStateUpdateListener; import org.apache.flink.runtime.executiongraph.VertexAttemptNumberStore; import org.apache.flink.runtime.jobgraph.JobGraph; @@ -57,6 +58,7 @@ ExecutionGraph createAndRestoreExecutionGraph( long initializationTimestamp, VertexAttemptNumberStore vertexAttemptNumberStore, VertexParallelismStore vertexParallelismStore, + ExecutionStateUpdateListener executionStateUpdateListener, Logger log) throws Exception; } diff --git 
a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/JobStatusStore.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/JobStatusStore.java new file mode 100644 index 0000000000000..2a6ef9ffa65a9 --- /dev/null +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/JobStatusStore.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler; + +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.runtime.executiongraph.JobStatusListener; +import org.apache.flink.runtime.executiongraph.JobStatusProvider; + +/** Listens for and exposes the current job state and state timestamps. 
*/ +public class JobStatusStore implements JobStatusListener, JobStatusProvider { + + private final long[] stateTimestamps = new long[JobStatus.values().length]; + private JobStatus jobStatus = JobStatus.INITIALIZING; + + public JobStatusStore(long initializationTimestamp) { + stateTimestamps[JobStatus.INITIALIZING.ordinal()] = initializationTimestamp; + } + + @Override + public void jobStatusChanges( + JobID jobId, JobStatus newJobStatus, long timestamp, Throwable error) { + jobStatus = newJobStatus; + stateTimestamps[jobStatus.ordinal()] = timestamp; + } + + @Override + public JobStatus getState() { + return jobStatus; + } + + @Override + public long getStatusTimestamp(JobStatus status) { + return stateTimestamps[status.ordinal()]; + } +} diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/SchedulerBase.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/SchedulerBase.java index d6cd08781e483..5e100d09e8ba4 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/SchedulerBase.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/SchedulerBase.java @@ -25,7 +25,10 @@ import org.apache.flink.api.common.JobStatus; import org.apache.flink.configuration.CheckpointingOptions; import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.MetricOptions; import org.apache.flink.configuration.WebOptions; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.metrics.MetricGroup; import org.apache.flink.queryablestate.KvStateID; import org.apache.flink.runtime.accumulators.AccumulatorSnapshot; import org.apache.flink.runtime.checkpoint.CheckpointCoordinator; @@ -51,8 +54,11 @@ import org.apache.flink.runtime.executiongraph.ExecutionJobVertex; import org.apache.flink.runtime.executiongraph.ExecutionVertex; import org.apache.flink.runtime.executiongraph.JobStatusListener; +import org.apache.flink.runtime.executiongraph.JobStatusProvider; import 
org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition; import org.apache.flink.runtime.executiongraph.failover.flip1.ResultPartitionAvailabilityChecker; +import org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge; +import org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge; import org.apache.flink.runtime.io.network.partition.ResultPartitionID; import org.apache.flink.runtime.jobgraph.IntermediateDataSetID; import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID; @@ -76,6 +82,8 @@ import org.apache.flink.runtime.query.UnknownKvStateLocation; import org.apache.flink.runtime.scheduler.exceptionhistory.FailureHandlingResultSnapshot; import org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry; +import org.apache.flink.runtime.scheduler.metrics.DeploymentStateTimeMetrics; +import org.apache.flink.runtime.scheduler.metrics.JobStatusMetrics; import org.apache.flink.runtime.scheduler.stopwithsavepoint.StopWithSavepointTerminationHandlerImpl; import org.apache.flink.runtime.scheduler.stopwithsavepoint.StopWithSavepointTerminationManager; import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID; @@ -107,6 +115,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.Executor; +import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -151,6 +160,10 @@ public abstract class SchedulerBase implements SchedulerNG, CheckpointScheduling private final ExecutionGraphFactory executionGraphFactory; + private final MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings; + + private final DeploymentStateTimeMetrics deploymentStateTimeMetrics; + public SchedulerBase( final Logger log, final JobGraph jobGraph, @@ -186,6 +199,11 @@ public SchedulerBase( SchedulerUtils.createCheckpointIDCounterIfCheckpointingIsEnabled( jobGraph, 
checkNotNull(checkpointRecoveryFactory)); + this.jobStatusMetricsSettings = + MetricOptions.JobStatusMetricsSettings.fromConfiguration(jobMasterConfiguration); + this.deploymentStateTimeMetrics = + new DeploymentStateTimeMetrics(jobGraph.getJobType(), jobStatusMetricsSettings); + this.executionGraph = createAndRestoreExecutionGraph( completedCheckpointStore, @@ -349,6 +367,7 @@ private ExecutionGraph createAndRestoreExecutionGraph( initializationTimestamp, new DefaultVertexAttemptNumberStore(), computeVertexParallelismStore(jobGraph), + deploymentStateTimeMetrics, log); newExecutionGraph.setInternalTaskFailuresListener( @@ -588,14 +607,37 @@ public ExecutionGraph getExecutionGraph() { @Override public final void startScheduling() { mainThreadExecutor.assertRunningInMainThread(); - registerJobMetrics(); + registerJobMetrics( + jobManagerJobMetricGroup, + executionGraph, + this::getNumberOfRestarts, + deploymentStateTimeMetrics, + executionGraph::registerJobStatusListener, + executionGraph.getStatusTimestamp(JobStatus.INITIALIZING), + jobStatusMetricsSettings); operatorCoordinatorHandler.startAllOperatorCoordinators(); startSchedulingInternal(); } - private void registerJobMetrics() { - jobManagerJobMetricGroup.gauge(MetricNames.NUM_RESTARTS, this::getNumberOfRestarts); - jobManagerJobMetricGroup.gauge(MetricNames.FULL_RESTARTS, this::getNumberOfRestarts); + public static void registerJobMetrics( + MetricGroup metrics, + JobStatusProvider jobStatusProvider, + Gauge numberOfRestarts, + DeploymentStateTimeMetrics deploymentTimeMetrics, + Consumer jobStatusListenerRegistrar, + long initializationTimestamp, + MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings) { + metrics.gauge(DownTimeGauge.METRIC_NAME, new DownTimeGauge(jobStatusProvider)); + metrics.gauge(UpTimeGauge.METRIC_NAME, new UpTimeGauge(jobStatusProvider)); + metrics.gauge(MetricNames.NUM_RESTARTS, numberOfRestarts); + metrics.gauge(MetricNames.FULL_RESTARTS, numberOfRestarts); + + final 
JobStatusMetrics jobStatusMetrics = + new JobStatusMetrics(initializationTimestamp, jobStatusMetricsSettings); + jobStatusMetrics.registerMetrics(metrics); + jobStatusListenerRegistrar.accept(jobStatusMetrics); + + deploymentTimeMetrics.registerMetrics(metrics); } protected abstract void startSchedulingInternal(); diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java index 9a1a252fcedf7..88849e1720b64 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java @@ -23,6 +23,7 @@ import org.apache.flink.api.common.JobStatus; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.MetricOptions; import org.apache.flink.configuration.SchedulerExecutionMode; import org.apache.flink.metrics.Gauge; import org.apache.flink.queryablestate.KvStateID; @@ -87,6 +88,7 @@ import org.apache.flink.runtime.scheduler.ExecutionGraphFactory; import org.apache.flink.runtime.scheduler.ExecutionGraphHandler; import org.apache.flink.runtime.scheduler.ExecutionGraphInfo; +import org.apache.flink.runtime.scheduler.JobStatusStore; import org.apache.flink.runtime.scheduler.OperatorCoordinatorHandler; import org.apache.flink.runtime.scheduler.SchedulerBase; import org.apache.flink.runtime.scheduler.SchedulerNG; @@ -99,6 +101,7 @@ import org.apache.flink.runtime.scheduler.adaptive.allocator.VertexParallelism; import org.apache.flink.runtime.scheduler.adaptive.scalingpolicy.ReactiveScaleUpController; import org.apache.flink.runtime.scheduler.adaptive.scalingpolicy.ScaleUpController; +import org.apache.flink.runtime.scheduler.metrics.DeploymentStateTimeMetrics; import 
org.apache.flink.runtime.state.KeyGroupRange; import org.apache.flink.runtime.util.ResourceCounter; import org.apache.flink.util.ExceptionUtils; @@ -116,7 +119,9 @@ import java.io.IOException; import java.net.InetSocketAddress; import java.time.Duration; +import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Iterator; import java.util.Optional; import java.util.concurrent.CompletableFuture; @@ -178,7 +183,7 @@ public class AdaptiveScheduler private final ComponentMainThreadExecutor componentMainThreadExecutor; private final FatalErrorHandler fatalErrorHandler; - private final JobStatusListener jobStatusListener; + private final Collection jobStatusListeners; private final SlotAllocator slotAllocator; @@ -203,6 +208,10 @@ public class AdaptiveScheduler private final SchedulerExecutionMode executionMode; + private final JobStatusStore jobStatusStore; + + private final DeploymentStateTimeMetrics deploymentTimeMetrics; + public AdaptiveScheduler( JobGraph jobGraph, Configuration configuration, @@ -255,7 +264,6 @@ public AdaptiveScheduler( declarativeSlotPool.registerNewSlotsListener(this::newResourcesAvailable); this.componentMainThreadExecutor = mainThreadExecutor; - this.jobStatusListener = jobStatusListener; this.scaleUpController = new ReactiveScaleUpController(configuration); @@ -265,7 +273,30 @@ public AdaptiveScheduler( this.executionGraphFactory = executionGraphFactory; - registerMetrics(); + // registerMetrics(); + + this.jobStatusStore = new JobStatusStore(initializationTimestamp); + + final Collection tmpJobStatusListeners = new ArrayList<>(); + tmpJobStatusListeners.add(Preconditions.checkNotNull(jobStatusListener)); + tmpJobStatusListeners.add(jobStatusStore); + + final MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings = + MetricOptions.JobStatusMetricsSettings.fromConfiguration(configuration); + + this.deploymentTimeMetrics = + new DeploymentStateTimeMetrics(jobGraph.getJobType(), 
jobStatusMetricsSettings); + + SchedulerBase.registerJobMetrics( + jobManagerJobMetricGroup, + jobStatusStore, + () -> (long) numRestarts, + deploymentTimeMetrics, + tmpJobStatusListeners::add, + initializationTimestamp, + jobStatusMetricsSettings); + + this.jobStatusListeners = Collections.unmodifiableCollection(tmpJobStatusListeners); } private static void assertPreconditions(JobGraph jobGraph) throws RuntimeException { @@ -993,6 +1024,7 @@ private ExecutionGraph createExecutionGraphAndRestoreState( initializationTimestamp, vertexAttemptNumberStore, adjustedParallelismStore, + deploymentTimeMetrics, LOG); } @@ -1047,14 +1079,6 @@ public void onFinished(ArchivedExecutionGraph archivedExecutionGraph) { archivedExecutionGraph.getState(), optionalFailure); - if (jobStatusListener != null) { - jobStatusListener.jobStatusChanges( - jobInformation.getJobID(), - archivedExecutionGraph.getState(), - archivedExecutionGraph.getStatusTimestamp(archivedExecutionGraph.getState()), - optionalFailure); - } - jobTerminationFuture.complete(archivedExecutionGraph.getState()); } @@ -1135,9 +1159,22 @@ T transitionToState(StateFactory targetState) { state.getClass().getSimpleName(), targetState.getStateClass().getSimpleName()); + final JobStatus previousJobStatus = state.getJobStatus(); + state.onLeave(targetState.getStateClass()); T targetStateInstance = targetState.getState(); state = targetStateInstance; + + final JobStatus newJobStatus = state.getJobStatus(); + + if (previousJobStatus != newJobStatus) { + final long timestamp = System.currentTimeMillis(); + jobStatusListeners.forEach( + listener -> + listener.jobStatusChanges( + jobInformation.getJobID(), newJobStatus, timestamp, null)); + } + return targetStateInstance; } finally { isTransitioningState = false; diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/DeploymentStateTimeMetrics.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/DeploymentStateTimeMetrics.java 
new file mode 100644
index 0000000000000..5f93cad05c9cb
--- /dev/null
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/DeploymentStateTimeMetrics.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.scheduler.metrics;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.configuration.MetricOptions;
+import org.apache.flink.metrics.MetricGroup;
+import org.apache.flink.runtime.execution.ExecutionState;
+import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
+import org.apache.flink.runtime.executiongraph.ExecutionStateUpdateListener;
+import org.apache.flink.runtime.jobgraph.JobType;
+import org.apache.flink.util.clock.Clock;
+import org.apache.flink.util.clock.SystemClock;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.function.Predicate;
+
+/**
+ * Metrics that capture how long a job was in SCHEDULED -> DEPLOYING -> INITIALIZING -> RUNNING.
+ *
+ * <p>Measures from the start of the first deployment until all tasks have been deployed. From that
+ * point on checkpoints can be triggered, and thus progress be made.
+ *
+ * <p>A deployment phase starts as soon as at least one execution is in SCHEDULED. It ends once
+ * every tracked execution is RUNNING, or once no tracked execution is left.
+ */
+public class DeploymentStateTimeMetrics
+        implements ExecutionStateUpdateListener, StateTimeMetric, MetricsRegistrar {
+
+    private static final Logger LOG = LoggerFactory.getLogger(DeploymentStateTimeMetrics.class);
+
+    /** Marker value of {@link #deploymentStart} while no deployment phase is in progress. */
+    private static final long NOT_STARTED = -1L;
+
+    /** Tests, given the number of RUNNING executions, whether the deployment phase is over. */
+    private final Predicate<Integer> deploymentEndPredicate;
+
+    private final MetricOptions.JobStatusMetricsSettings stateTimeMetricsSettings;
+    private final Clock clock;
+
+    // deployment book-keeping
+    private final Set<ExecutionAttemptID> expectedDeployments = new HashSet<>();
+    private int pendingDeployments = 0;
+    private int completedDeployments = 0;
+
+    // metrics state
+    private long deploymentStart = NOT_STARTED;
+    private long deploymentTimeTotal = 0L;
+
+    /**
+     * Creates the metrics using the system clock.
+     *
+     * @param semantic job type; accepted for signature compatibility but not evaluated
+     * @param stateTimeMetricsSettings selects which of the state/current-time/total-time gauges
+     *     get registered
+     */
+    public DeploymentStateTimeMetrics(
+            JobType semantic, MetricOptions.JobStatusMetricsSettings stateTimeMetricsSettings) {
+        this(semantic, stateTimeMetricsSettings, SystemClock.getInstance());
+    }
+
+    @VisibleForTesting
+    DeploymentStateTimeMetrics(
+            JobType semantic,
+            MetricOptions.JobStatusMetricsSettings stateTimeMetricsSettings,
+            Clock clock) {
+        this.stateTimeMetricsSettings = stateTimeMetricsSettings;
+        this.clock = clock;
+        // the job type is not evaluated; the same end condition applies to all jobs
+        this.deploymentEndPredicate =
+                runningDeployments -> runningDeployments == expectedDeployments.size();
+    }
+
+    /** Returns the duration of the ongoing deployment phase in milliseconds, or 0 if none. */
+    @Override
+    public long getCurrentTime() {
+        return deploymentStart == NOT_STARTED
+                ? 0L
+                : Math.max(0L, clock.absoluteTimeMillis() - deploymentStart);
+    }
+
+    /** Returns the milliseconds spent in all deployment phases, including the ongoing one. */
+    @Override
+    public long getTotalTime() {
+        return getCurrentTime() + deploymentTimeTotal;
+    }
+
+    /** Returns 1 while a deployment phase is ongoing, otherwise 0. */
+    @Override
+    public long getBinary() {
+        return deploymentStart == NOT_STARTED ? 0L : 1L;
+    }
+
+    @Override
+    public void registerMetrics(MetricGroup metricGroup) {
+        StateTimeMetric.register(stateTimeMetricsSettings, metricGroup, this, "deploying");
+    }
+
+    /**
+     * Updates the deployment book-keeping for a single execution state transition and starts or
+     * ends the deployment phase accordingly.
+     */
+    @Override
+    public void onStateUpdate(
+            ExecutionAttemptID execution, ExecutionState previousState, ExecutionState newState) {
+        // invoked for every transition of every execution, so keep this off the INFO level
+        LOG.debug(
+                "OnStateUpdate: execution [{}], previousState [{}], newState [{}]",
+                execution,
+                previousState,
+                newState);
+
+        switch (newState) {
+            case SCHEDULED:
+                expectedDeployments.add(execution);
+                pendingDeployments++;
+                break;
+            case DEPLOYING:
+            case INITIALIZING:
+                // still deploying; nothing to count
+                break;
+            case RUNNING:
+                completedDeployments++;
+                break;
+            default:
+                // the deployment started terminating
+                expectedDeployments.remove(execution);
+        }
+        switch (previousState) {
+            case SCHEDULED:
+                pendingDeployments--;
+                break;
+            case RUNNING:
+                completedDeployments--;
+                break;
+            default:
+                // no counter is kept for the remaining states
+                break;
+        }
+
+        if (deploymentStart == NOT_STARTED) {
+            if (pendingDeployments > 0) {
+                markDeploymentStart();
+            }
+        } else if (deploymentEndPredicate.test(completedDeployments)
+                || expectedDeployments.isEmpty()) {
+            markDeploymentEnd();
+        }
+    }
+
+    private void markDeploymentStart() {
+        deploymentStart = clock.absoluteTimeMillis();
+    }
+
+    private void markDeploymentEnd() {
+        final long deploymentTime = Math.max(0L, clock.absoluteTimeMillis() - deploymentStart);
+        deploymentTimeTotal += deploymentTime;
+        LOG.info(
+                "The job: deploymentStartTime [{}], deploymentTime [{}]",
+                deploymentStart,
+                deploymentTime);
+        deploymentStart = NOT_STARTED;
+    }
+
+    @VisibleForTesting
+    boolean hasCleanState() {
+        return expectedDeployments.isEmpty()
+                && pendingDeployments == 0
+                && completedDeployments == 0
+                && deploymentStart == NOT_STARTED;
+    }
+}
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetrics.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetrics.java new file mode 100644 index
0000000000000..312ae3493f9c5
--- /dev/null
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetrics.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.scheduler.metrics;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.JobID;
+import org.apache.flink.api.common.JobStatus;
+import org.apache.flink.configuration.MetricOptions;
+import org.apache.flink.metrics.MetricGroup;
+import org.apache.flink.runtime.executiongraph.JobStatusListener;
+import org.apache.flink.util.clock.Clock;
+import org.apache.flink.util.clock.SystemClock;
+
+import java.util.Locale;
+
+/**
+ * Metrics that capture the time that a job spends in each {@link JobStatus}.
+ *
+ * <p>Tracking starts in {@link JobStatus#INITIALIZING} at the initialization timestamp; every
+ * later change is reported through {@link #jobStatusChanges}.
+ */
+public class JobStatusMetrics implements JobStatusListener, MetricsRegistrar {
+
+    private final MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings;
+    private JobStatus currentStatus;
+    private long currentStatusTimestamp;
+    /** Time accumulated in each status so far, indexed by {@link JobStatus#ordinal()}. */
+    private final long[] cumulativeStatusTimes;
+    private final Clock clock;
+
+    public JobStatusMetrics(
+            long initializationTimestamp,
+            MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings) {
+        this(initializationTimestamp, jobStatusMetricsSettings, SystemClock.getInstance());
+    }
+
+    @VisibleForTesting
+    JobStatusMetrics(
+            long initializationTimestamp,
+            MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings,
+            Clock clock) {
+        this.jobStatusMetricsSettings = jobStatusMetricsSettings;
+        this.clock = clock;
+
+        currentStatus = JobStatus.INITIALIZING;
+        currentStatusTimestamp = initializationTimestamp;
+        cumulativeStatusTimes = new long[JobStatus.values().length];
+    }
+
+    /** Registers the enabled gauges for every non-terminal status except RECONCILING. */
+    @Override
+    public void registerMetrics(MetricGroup metricGroup) {
+        for (JobStatus jobStatus : JobStatus.values()) {
+            if (!jobStatus.isTerminalState() && jobStatus != JobStatus.RECONCILING) {
+
+                final StateTimeMetric stateTimeMetric = createTimeMetric(jobStatus);
+
+                StateTimeMetric.register(
+                        jobStatusMetricsSettings,
+                        metricGroup,
+                        stateTimeMetric,
+                        getBaseMetricName(jobStatus));
+            }
+        }
+    }
+
+    @VisibleForTesting
+    StateTimeMetric createTimeMetric(JobStatus jobStatus) {
+        return new StateTimeMetric() {
+            @Override
+            public long getCurrentTime() {
+                return currentStatus == jobStatus
+                        ? Math.max(clock.absoluteTimeMillis() - currentStatusTimestamp, 0)
+                        : 0;
+            }
+
+            @Override
+            public long getTotalTime() {
+                return currentStatus == jobStatus
+                        ? cumulativeStatusTimes[jobStatus.ordinal()]
+                                + Math.max(clock.absoluteTimeMillis() - currentStatusTimestamp, 0)
+                        : cumulativeStatusTimes[jobStatus.ordinal()];
+            }
+
+            @Override
+            public long getBinary() {
+                return currentStatus == jobStatus ? 1L : 0L;
+            }
+        };
+    }
+
+    @VisibleForTesting
+    static String getBaseMetricName(JobStatus jobStatus) {
+        return jobStatus.name().toLowerCase(Locale.ROOT);
+    }
+
+    /**
+     * Closes the time slice of the current status and switches to {@code newJobStatus}.
+     *
+     * <p>A {@code timestamp} that lies before the previous one contributes 0 instead of a negative
+     * duration, matching the clamping applied when the gauges are read.
+     */
+    @Override
+    public void jobStatusChanges(
+            JobID jobId, JobStatus newJobStatus, long timestamp, Throwable error) {
+        cumulativeStatusTimes[currentStatus.ordinal()] +=
+                Math.max(timestamp - currentStatusTimestamp, 0L);
+
+        currentStatus = newJobStatus;
+        currentStatusTimestamp = timestamp;
+    }
+}
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/MetricsRegistrar.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/MetricsRegistrar.java new file mode 100644 index 0000000000000..fb6c7d8f57c08 --- /dev/null +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/MetricsRegistrar.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.scheduler.metrics; + +import org.apache.flink.metrics.MetricGroup; + +/** A component that can register metrics.
*/ +public interface MetricsRegistrar { + void registerMetrics(MetricGroup metricGroup); +}
diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetric.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetric.java
new file mode 100644
index 0000000000000..c19b16d434135
--- /dev/null
+++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetric.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.scheduler.metrics;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.configuration.MetricOptions;
+import org.apache.flink.metrics.MetricGroup;
+
+/** Utility to define metrics that capture the time that some component spends in a state. */
+public interface StateTimeMetric {
+
+    /**
+     * Returns the time, in milliseconds, that has elapsed since we transitioned to the targeted
+     * state. Returns 0 if we are not in the targeted state.
+     */
+    long getCurrentTime();
+
+    /** Returns the total time, in milliseconds, that we have spent in the targeted state. */
+    long getTotalTime();
+
+    /** Returns 1 if we are in the targeted state, otherwise 0. */
+    long getBinary();
+
+    /**
+     * Registers one gauge per enabled metric kind for the given {@code stateTimeMetric}.
+     *
+     * @param jobStatusMetricsSettings decides which of the state, current-time and total-time
+     *     gauges are registered
+     * @param metricGroup group the gauges are added to
+     * @param stateTimeMetric source of the gauge values
+     * @param baseName prefix of the gauge names; the suffixes are {@code State}, {@code Time} and
+     *     {@code TimeTotal}
+     */
+    static void register(
+            MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings,
+            MetricGroup metricGroup,
+            StateTimeMetric stateTimeMetric,
+            String baseName) {
+
+        if (jobStatusMetricsSettings.isStateMetricsEnabled()) {
+            metricGroup.gauge(getStateMetricName(baseName), stateTimeMetric::getBinary);
+        }
+
+        if (jobStatusMetricsSettings.isCurrentTimeMetricsEnabled()) {
+            metricGroup.gauge(getCurrentTimeMetricName(baseName), stateTimeMetric::getCurrentTime);
+        }
+
+        if (jobStatusMetricsSettings.isTotalTimeMetricsEnabled()) {
+            metricGroup.gauge(getTotalTimeMetricName(baseName), stateTimeMetric::getTotalTime);
+        }
+    }
+
+    @VisibleForTesting
+    static String getStateMetricName(String baseName) {
+        return baseName + "State";
+    }
+
+    @VisibleForTesting
+    static String getCurrentTimeMetricName(String baseName) {
+        return baseName + "Time";
+    }
+
+    @VisibleForTesting
+    static String getTotalTimeMetricName(String baseName) {
+        return baseName + "TimeTotal";
+    }
+}
diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/TestingDefaultExecutionGraphBuilder.java b/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/TestingDefaultExecutionGraphBuilder.java index 1a4180e4ed79a..9d5d8fba64860 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/TestingDefaultExecutionGraphBuilder.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/TestingDefaultExecutionGraphBuilder.java @@ -76,7 +76,8 @@ public static TestingDefaultExecutionGraphBuilder newBuilder() { private CheckpointIDCounter checkpointIdCounter = new StandaloneCheckpointIDCounter(); private ExecutionDeploymentListener executionDeploymentListener = NoOpExecutionDeploymentListener.get(); - private ExecutionStateUpdateListener executionStateUpdateListener = (execution, newState) -> {}; + private
ExecutionStateUpdateListener executionStateUpdateListener = + (execution, currentState, newState) -> {}; private VertexParallelismStore vertexParallelismStore; private TestingDefaultExecutionGraphBuilder() {} diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGaugeTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGaugeTest.java deleted file mode 100644 index a927508f3dd10..0000000000000 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/executiongraph/metrics/RestartTimeGaugeTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.runtime.executiongraph.metrics; - -import org.apache.flink.api.common.JobStatus; -import org.apache.flink.runtime.executiongraph.TestingJobStatusProvider; -import org.apache.flink.util.TestLogger; - -import org.junit.Test; - -import java.util.HashMap; -import java.util.Map; - -import static org.hamcrest.Matchers.greaterThan; -import static org.hamcrest.Matchers.is; -import static org.junit.Assert.assertThat; - -/** Tests for {@link RestartTimeGauge}. 
*/ -public class RestartTimeGaugeTest extends TestLogger { - - @Test - public void testNotRestarted() { - final RestartTimeGauge gauge = - new RestartTimeGauge(new TestingJobStatusProvider(JobStatus.RUNNING, -1)); - assertThat(gauge.getValue(), is(0L)); - } - - @Test - public void testInRestarting() { - final Map statusTimestampMap = new HashMap<>(); - statusTimestampMap.put(JobStatus.RESTARTING, 1L); - - final RestartTimeGauge gauge = - new RestartTimeGauge( - new TestingJobStatusProvider( - JobStatus.RESTARTING, - status -> statusTimestampMap.getOrDefault(status, -1L))); - assertThat(gauge.getValue(), greaterThan(0L)); - } - - @Test - public void testRunningAfterRestarting() { - final Map statusTimestampMap = new HashMap<>(); - statusTimestampMap.put(JobStatus.RESTARTING, 123L); - statusTimestampMap.put(JobStatus.RUNNING, 234L); - - final RestartTimeGauge gauge = - new RestartTimeGauge( - new TestingJobStatusProvider( - JobStatus.RUNNING, - status -> statusTimestampMap.getOrDefault(status, -1L))); - assertThat(gauge.getValue(), is(111L)); - } - - @Test - public void testFailedAfterRestarting() { - final Map statusTimestampMap = new HashMap<>(); - statusTimestampMap.put(JobStatus.RESTARTING, 123L); - statusTimestampMap.put(JobStatus.FAILED, 456L); - - final RestartTimeGauge gauge = - new RestartTimeGauge( - new TestingJobStatusProvider( - JobStatus.FAILED, - status -> statusTimestampMap.getOrDefault(status, -1L))); - assertThat(gauge.getValue(), is(333L)); - } -} diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/SlotPoolTestUtils.java b/flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/SlotPoolTestUtils.java index f83e7cdb2415e..95a6f2256344c 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/SlotPoolTestUtils.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/SlotPoolTestUtils.java @@ -61,4 +61,12 @@ public static Collection offerSlots( 
return slotPool.offerSlots( slotOffers, new LocalTaskManagerLocation(), taskManagerGateway, 0); } + + @Nonnull + public static Collection offerSlots( + SlotPool slotPool, + Collection slotOffers, + TaskManagerGateway taskManagerGateway) { + return slotPool.offerSlots(new LocalTaskManagerLocation(), taskManagerGateway, slotOffers); + } } diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactoryTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactoryTest.java index c838e93a9bb4c..fb1cd7474266e 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactoryTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/DefaultExecutionGraphFactoryTest.java @@ -77,6 +77,7 @@ public void testRestoringModifiedJobFromSavepointFails() throws Exception { 0L, new DefaultVertexAttemptNumberStore(), SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator), + (execution, previousState, newState) -> {}, log); fail("Expected ExecutionGraph creation to fail because of non restored state."); } catch (Exception e) { @@ -105,6 +106,7 @@ public void testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSuccee 0L, new DefaultVertexAttemptNumberStore(), SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator), + (execution, previousState, newState) -> {}, log); final CompletedCheckpoint savepoint = completedCheckpointStore.getLatestCheckpoint(false); diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java index 3f8546749ddaa..41c483f389beb 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java 
@@ -22,6 +22,7 @@ import org.apache.flink.api.common.time.Time; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.MetricOptions; import org.apache.flink.configuration.SchedulerExecutionMode; import org.apache.flink.metrics.Gauge; import org.apache.flink.runtime.checkpoint.CheckpointException; @@ -44,10 +45,13 @@ import org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition; import org.apache.flink.runtime.executiongraph.failover.flip1.NoRestartBackoffTimeStrategy; import org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy; +import org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge; +import org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge; import org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway; import org.apache.flink.runtime.io.network.partition.ResultPartitionID; import org.apache.flink.runtime.jobgraph.IntermediateDataSetID; import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobgraph.JobGraphBuilder; import org.apache.flink.runtime.jobgraph.JobVertex; import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration; @@ -57,7 +61,7 @@ import org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool; import org.apache.flink.runtime.metrics.MetricNames; import org.apache.flink.runtime.metrics.MetricRegistry; -import org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup; +import org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup; import org.apache.flink.runtime.metrics.util.TestingMetricRegistry; import org.apache.flink.runtime.operators.coordination.CoordinationRequest; import org.apache.flink.runtime.operators.coordination.TaskNotRunningException; @@ -89,6 +93,7 @@ import java.io.IOException; import java.time.Duration; import 
java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Optional; import java.util.concurrent.ArrayBlockingQueue; @@ -106,8 +111,8 @@ import static org.apache.flink.runtime.jobgraph.JobGraphTestUtils.streamingJobGraph; import static org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements; import static org.apache.flink.runtime.jobmaster.slotpool.SlotPoolTestUtils.offerSlots; -import static org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups.createUnregisteredJobManagerMetricGroup; import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.core.Is.is; import static org.junit.Assert.assertFalse; @@ -440,11 +445,12 @@ public void testNumRestartsMetric() throws Exception { new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor) .setJobMasterConfiguration(configuration) .setJobManagerJobMetricGroup( - new JobManagerJobMetricGroup( - metricRegistry, - createUnregisteredJobManagerMetricGroup(), - new JobID(), - "jobName")) + JobManagerMetricGroup.createJobManagerMetricGroup( + metricRegistry, "localhost") + .addJob( + JobGraphBuilder.newStreamingJobGraphBuilder() + .setJobName("jobName") + .build())) .setDeclarativeSlotPool(declarativeSlotPool) .build(); @@ -489,6 +495,105 @@ public void testNumRestartsMetric() throws Exception { assertThat(numRestartsMetric.getValue(), is(1)); } + @Test + public void testStatusMetrics() throws Exception { + final CompletableFuture upTimeMetricFuture = new CompletableFuture<>(); + final CompletableFuture downTimeMetricFuture = new CompletableFuture<>(); + final CompletableFuture> restartTimeMetricFuture = new CompletableFuture<>(); + final MetricRegistry metricRegistry = + TestingMetricRegistry.builder() + .setRegisterConsumer( + (metric, name, group) -> { + switch (name) { + case UpTimeGauge.METRIC_NAME: + 
upTimeMetricFuture.complete((UpTimeGauge) metric); + break; + case DownTimeGauge.METRIC_NAME: + downTimeMetricFuture.complete((DownTimeGauge) metric); + break; + case "restartingTimeTotal": + restartTimeMetricFuture.complete((Gauge) metric); + break; + } + }) + .build(); + + final JobGraph jobGraph = createJobGraph(); + + final DefaultDeclarativeSlotPool declarativeSlotPool = + createDeclarativeSlotPool(jobGraph.getJobID()); + + final Configuration configuration = new Configuration(); + configuration.set(JobManagerOptions.MIN_PARALLELISM_INCREASE, 1); + configuration.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(10L)); + configuration.set( + MetricOptions.JOB_STATUS_METRICS, + Arrays.asList(MetricOptions.JobStatusMetrics.TOTAL_TIME)); + + final AdaptiveScheduler scheduler = + new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor) + .setJobMasterConfiguration(configuration) + .setJobManagerJobMetricGroup( + JobManagerMetricGroup.createJobManagerMetricGroup( + metricRegistry, "localhost") + .addJob( + JobGraphBuilder.newStreamingJobGraphBuilder() + .setJobName("jobName") + .build())) + .setDeclarativeSlotPool(declarativeSlotPool) + .build(); + + final UpTimeGauge upTimeGauge = upTimeMetricFuture.get(); + final DownTimeGauge downTimeGauge = downTimeMetricFuture.get(); + final Gauge restartTimeGauge = restartTimeMetricFuture.get(); + + final SubmissionBufferingTaskManagerGateway taskManagerGateway = + new SubmissionBufferingTaskManagerGateway(1 + PARALLELISM); + + taskManagerGateway.setCancelConsumer(createCancelConsumer(scheduler)); + + singleThreadMainThreadExecutor.execute( + () -> { + scheduler.startScheduling(); + + offerSlots( + declarativeSlotPool, + createSlotOffersForResourceRequirements( + ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)), + taskManagerGateway); + }); + + // wait for the first task submission + taskManagerGateway.waitForSubmissions(1, Duration.ofSeconds(5)); + + // sleep a bit to ensure uptime is > 0 + 
Thread.sleep(10L); + + assertThat(upTimeGauge.getValue(), greaterThan(0L)); + assertThat(downTimeGauge.getValue(), is(0L)); + assertThat(restartTimeGauge.getValue(), is(0L)); + + singleThreadMainThreadExecutor.execute( + () -> { + // offer more slots, which will cause a restart in order to scale up + offerSlots( + declarativeSlotPool, + createSlotOffersForResourceRequirements( + ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)), + taskManagerGateway); + }); + + // wait for the second task submissions + taskManagerGateway.waitForSubmissions(2, Duration.ofSeconds(5)); + + // sleep a bit to ensure uptime is > 0 + Thread.sleep(10L); + + assertThat(upTimeGauge.getValue(), greaterThan(0L)); + assertThat(downTimeGauge.getValue(), is(0L)); + assertThat(restartTimeGauge.getValue(), greaterThan(0L)); + } + // --------------------------------------------------------------------------------------------- // State transition tests // --------------------------------------------------------------------------------------------- diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java index ca0b041e62dd5..d392fbae50bd6 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java @@ -909,7 +909,10 @@ public ExecutionDeploymentListener getExecutionDeploymentListener() { } @Override - public void notifyExecutionChange(Execution execution, ExecutionState newExecutionState) {} + public void notifyExecutionChange( + Execution execution, + ExecutionState previousState, + ExecutionState newExecutionState) {} @Override public EdgeManager getEdgeManager() { diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/ExecutionStateTimeMetricsTest.java 
b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/ExecutionStateTimeMetricsTest.java new file mode 100644 index 0000000000000..e09d86d6c995e --- /dev/null +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/ExecutionStateTimeMetricsTest.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.scheduler.metrics; + +import org.apache.flink.configuration.MetricOptions; +import org.apache.flink.runtime.execution.ExecutionState; +import org.apache.flink.runtime.executiongraph.ExecutionAttemptID; +import org.apache.flink.runtime.jobgraph.JobType; +import org.apache.flink.util.clock.ManualClock; + +import org.junit.jupiter.api.Test; + +import java.time.Duration; + +import static org.apache.flink.runtime.scheduler.metrics.StateTimeMetricTest.enable; +import static org.assertj.core.api.Assertions.assertThat; + +class ExecutionStateTimeMetricsTest { + + private static final MetricOptions.JobStatusMetricsSettings settings = + enable( + MetricOptions.JobStatusMetrics.STATE, + MetricOptions.JobStatusMetrics.CURRENT_TIME, + MetricOptions.JobStatusMetrics.TOTAL_TIME); + + @Test + void testInitialValues() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics deploymentStateTimeMetrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + assertThat(deploymentStateTimeMetrics.getCurrentTime()).isEqualTo(0L); + assertThat(deploymentStateTimeMetrics.getTotalTime()).isEqualTo(0L); + assertThat(deploymentStateTimeMetrics.getBinary()).isEqualTo(0L); + } + + @Test + void testDeploymentStartsOnFirstDeploying() { + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + assertThat(metrics.getBinary()).isEqualTo(0L); + + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + assertThat(metrics.getBinary()).isEqualTo(1L); + } + + @Test + void testDeploymentStart_batch_notTriggeredIfOneDeploymentIsRunning() { + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings); + + final ExecutionAttemptID id1 = 
new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + assertThat(metrics.getBinary()).isEqualTo(0L); + } + + @Test + void testDeploymentEnd_batch() { + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + assertThat(metrics.getBinary()).isEqualTo(0L); + } + + @Test + void testDeploymentEnd_streaming() { + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.STREAMING, settings); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + 
assertThat(metrics.getBinary()).isEqualTo(1L); + + metrics.onStateUpdate(id2, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + assertThat(metrics.getBinary()).isEqualTo(0L); + } + + @Test + void testDeploymentEnd_streaming_ignoresTerminalDeployments() { + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.STREAMING, settings); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + metrics.onStateUpdate(id1, ExecutionState.INITIALIZING, ExecutionState.FINISHED); + assertThat(metrics.getBinary()).isEqualTo(1L); + + metrics.onStateUpdate(id2, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + assertThat(metrics.getBinary()).isEqualTo(0L); + } + + @Test + void testGetCurrentTime() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + clock.advanceTime(Duration.ofMillis(5)); + assertThat(metrics.getCurrentTime()).isEqualTo(5L); + } + + @Test + void testGetCurrentTimeResetOndDeployentEnd() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new 
DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + + assertThat(metrics.getCurrentTime()).isEqualTo(0L); + } + + @Test + void testGetCurrentTime_notResetOnSecondaryDeployment() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + clock.advanceTime(Duration.ofMillis(5)); + + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + clock.advanceTime(Duration.ofMillis(5)); + assertThat(metrics.getCurrentTime()).isEqualTo(10L); + } + + @Test + void testGetTotalTime() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + clock.advanceTime(Duration.ofMillis(5)); + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.FINISHED); + assertThat(metrics.getTotalTime()).isEqualTo(5L); + + final 
ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id2, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id2, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + clock.advanceTime(Duration.ofMillis(5)); + metrics.onStateUpdate(id2, ExecutionState.DEPLOYING, ExecutionState.FINISHED); + assertThat(metrics.getTotalTime()).isEqualTo(10L); + } + + @Test + void testGetTotalTimeIncludesCurrentTime() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + + clock.advanceTime(Duration.ofMillis(5)); + assertThat(metrics.getTotalTime()).isEqualTo(5L); + } + + @Test + void testCleanStateAfterFullDeploymentCycle() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.INITIALIZING); + metrics.onStateUpdate(id1, ExecutionState.INITIALIZING, ExecutionState.RUNNING); + metrics.onStateUpdate(id1, ExecutionState.RUNNING, ExecutionState.CANCELING); + metrics.onStateUpdate(id1, ExecutionState.CANCELING, ExecutionState.CANCELED); + + assertThat(metrics.hasCleanState()).isEqualTo(true); + } + + @Test + void 
testCleanStateAfterEarlyDeploymentFailure() { + final ManualClock clock = new ManualClock(Duration.ofMillis(5).toNanos()); + + final DeploymentStateTimeMetrics metrics = + new DeploymentStateTimeMetrics(JobType.BATCH, settings, clock); + + final ExecutionAttemptID id1 = new ExecutionAttemptID(); + final ExecutionAttemptID id2 = new ExecutionAttemptID(); + + metrics.onStateUpdate(id1, ExecutionState.CREATED, ExecutionState.SCHEDULED); + metrics.onStateUpdate(id1, ExecutionState.SCHEDULED, ExecutionState.DEPLOYING); + metrics.onStateUpdate(id1, ExecutionState.DEPLOYING, ExecutionState.FAILED); + + assertThat(metrics.hasCleanState()).isEqualTo(true); + } +} diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetricsTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetricsTest.java new file mode 100644 index 0000000000000..53cb020434012 --- /dev/null +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/JobStatusMetricsTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.scheduler.metrics; + +import org.apache.flink.api.common.JobID; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.configuration.MetricOptions; +import org.apache.flink.runtime.metrics.util.InterceptingOperatorMetricGroup; +import org.apache.flink.util.clock.ManualClock; + +import org.junit.jupiter.api.Test; + +import java.time.Duration; +import java.util.Map; + +import static org.apache.flink.runtime.scheduler.metrics.StateTimeMetricTest.enable; +import static org.apache.flink.runtime.scheduler.metrics.StateTimeMetricTest.extractMetrics; +import static org.assertj.core.api.Assertions.assertThat; + +class JobStatusMetricsTest { + + @Test + void testStateMetric() { + final JobStatusMetrics jobStatusMetrics = + new JobStatusMetrics( + 0L, + enable( + MetricOptions.JobStatusMetrics.STATE, + MetricOptions.JobStatusMetrics.CURRENT_TIME, + MetricOptions.JobStatusMetrics.TOTAL_TIME)); + + final StateTimeMetric metric = jobStatusMetrics.createTimeMetric(JobStatus.RUNNING); + + assertThat(metric.getBinary()).isEqualTo(0L); + jobStatusMetrics.jobStatusChanges(new JobID(), JobStatus.RUNNING, 1L, null); + assertThat(metric.getBinary()).isEqualTo(1L); + jobStatusMetrics.jobStatusChanges(new JobID(), JobStatus.RESTARTING, 2L, null); + assertThat(metric.getBinary()).isEqualTo(0L); + } + + @Test + void testCurrentTimeMetric() { + final ManualClock clock = new ManualClock(); + final JobStatusMetrics jobStatusMetrics = + new JobStatusMetrics( + 0L, + enable( + MetricOptions.JobStatusMetrics.STATE, + MetricOptions.JobStatusMetrics.CURRENT_TIME, + MetricOptions.JobStatusMetrics.TOTAL_TIME), + clock); + final StateTimeMetric metric = jobStatusMetrics.createTimeMetric(JobStatus.RUNNING); + + assertThat(metric.getCurrentTime()).isEqualTo(0L); + jobStatusMetrics.jobStatusChanges(new JobID(), JobStatus.RUNNING, 1L, null); + clock.advanceTime(Duration.ofMillis(11)); + assertThat(metric.getCurrentTime()).isEqualTo(10L); + 
jobStatusMetrics.jobStatusChanges(new JobID(), JobStatus.RESTARTING, 15L, null); + assertThat(metric.getCurrentTime()).isEqualTo(0L); + } + + @Test + void testTotalTimeMetric() { + final ManualClock clock = new ManualClock(0); + final JobStatusMetrics jobStatusMetrics = + new JobStatusMetrics( + 0L, + enable( + MetricOptions.JobStatusMetrics.STATE, + MetricOptions.JobStatusMetrics.CURRENT_TIME, + MetricOptions.JobStatusMetrics.TOTAL_TIME), + clock); + + final StateTimeMetric metric = jobStatusMetrics.createTimeMetric(JobStatus.RUNNING); + + assertThat(metric.getTotalTime()).isEqualTo(0L); + + jobStatusMetrics.jobStatusChanges( + new JobID(), JobStatus.RUNNING, clock.absoluteTimeMillis(), null); + + clock.advanceTime(Duration.ofMillis(10)); + assertThat(metric.getTotalTime()).isEqualTo(10L); + + jobStatusMetrics.jobStatusChanges( + new JobID(), JobStatus.RESTARTING, clock.absoluteTimeMillis(), null); + + clock.advanceTime(Duration.ofMillis(4)); + assertThat(metric.getTotalTime()).isEqualTo(10L); + + jobStatusMetrics.jobStatusChanges( + new JobID(), JobStatus.RUNNING, clock.absoluteTimeMillis(), null); + + clock.advanceTime(Duration.ofMillis(1)); + assertThat(metric.getTotalTime()).isEqualTo(11L); + } + + @Test + void testStatusSelection() { + final InterceptingOperatorMetricGroup metricGroup = new InterceptingOperatorMetricGroup(); + + final JobStatusMetrics jobStatusMetrics = + new JobStatusMetrics(0L, enable(MetricOptions.JobStatusMetrics.STATE)); + jobStatusMetrics.registerMetrics(metricGroup); + final Map registeredMetrics = + extractMetrics(metricGroup); + + for (JobStatus value : JobStatus.values()) { + if (value.isTerminalState() || value == JobStatus.RECONCILING) { + assertThat(registeredMetrics).doesNotContainKey(value); + } else { + assertThat(registeredMetrics).containsKey(value); + } + } + } +} diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetricTest.java 
b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetricTest.java new file mode 100644 index 0000000000000..f48d7f6130af8 --- /dev/null +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/metrics/StateTimeMetricTest.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.scheduler.metrics; + +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.MetricOptions; +import org.apache.flink.metrics.Gauge; +import org.apache.flink.runtime.metrics.util.InterceptingOperatorMetricGroup; + +import org.junit.jupiter.api.Test; + +import javax.annotation.Nullable; + +import java.util.Arrays; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.Map; +import java.util.Optional; + +import static org.assertj.core.api.Assertions.assertThat; + +class StateTimeMetricTest { + + @Test + void testEnableStateMetrics() { + testMetricSelection(MetricOptions.JobStatusMetrics.STATE); + } + + @Test + void testEnableCurrentTimeMetrics() { + testMetricSelection(MetricOptions.JobStatusMetrics.CURRENT_TIME); + } + + @Test + void testEnableTotalTimeMetrics() { + testMetricSelection(MetricOptions.JobStatusMetrics.TOTAL_TIME); + } + + @Test + void testEnableMultipleMetrics() { + testMetricSelection( + MetricOptions.JobStatusMetrics.CURRENT_TIME, + MetricOptions.JobStatusMetrics.TOTAL_TIME); + } + + private static void testMetricSelection(MetricOptions.JobStatusMetrics... 
selectedMetrics) { + final EnumSet selectedMetricsSet = + EnumSet.noneOf(MetricOptions.JobStatusMetrics.class); + Arrays.stream(selectedMetrics).forEach(selectedMetricsSet::add); + + final InterceptingOperatorMetricGroup metricGroup = new InterceptingOperatorMetricGroup(); + + StateTimeMetric.register( + enable(selectedMetrics), metricGroup, new TestStateTimeMetric(), "test"); + final Map registeredMetrics = extractMetrics(metricGroup); + + for (StatusMetricSet metrics : registeredMetrics.values()) { + assertThat(metrics.getState().isPresent()) + .isEqualTo(selectedMetricsSet.contains(MetricOptions.JobStatusMetrics.STATE)); + assertThat(metrics.getCurrentTime().isPresent()) + .isEqualTo( + selectedMetricsSet.contains( + MetricOptions.JobStatusMetrics.CURRENT_TIME)); + assertThat(metrics.getTotalTime().isPresent()) + .isEqualTo( + selectedMetricsSet.contains(MetricOptions.JobStatusMetrics.TOTAL_TIME)); + } + } + + static MetricOptions.JobStatusMetricsSettings enable( + MetricOptions.JobStatusMetrics... 
enabledMetrics) { + final Configuration configuration = new Configuration(); + + configuration.set(MetricOptions.JOB_STATUS_METRICS, Arrays.asList(enabledMetrics)); + + return MetricOptions.JobStatusMetricsSettings.fromConfiguration(configuration); + } + + static Map extractMetrics(InterceptingOperatorMetricGroup metrics) { + final Map extractedMetrics = new EnumMap<>(JobStatus.class); + + for (JobStatus jobStatus : JobStatus.values()) { + final String baseMetricName = JobStatusMetrics.getBaseMetricName(jobStatus); + final StatusMetricSet statusMetricSet = + new StatusMetricSet( + (Gauge) + metrics.get(StateTimeMetric.getStateMetricName(baseMetricName)), + (Gauge) + metrics.get( + StateTimeMetric.getCurrentTimeMetricName( + baseMetricName)), + (Gauge) + metrics.get( + StateTimeMetric.getTotalTimeMetricName( + baseMetricName))); + if (statusMetricSet.getState().isPresent() + || statusMetricSet.getCurrentTime().isPresent() + || statusMetricSet.getTotalTime().isPresent()) { + extractedMetrics.put(jobStatus, statusMetricSet); + } + } + + return extractedMetrics; + } + + private static class TestStateTimeMetric implements StateTimeMetric { + + @Override + public long getCurrentTime() { + return 2; + } + + @Override + public long getTotalTime() { + return 3; + } + + @Override + public long getBinary() { + return 1; + } + } + + static class StatusMetricSet { + + @Nullable private final Gauge state; + @Nullable private final Gauge currentTime; + @Nullable private final Gauge totalTime; + + private StatusMetricSet( + @Nullable Gauge state, + @Nullable Gauge currentTime, + @Nullable Gauge totalTime) { + this.state = state; + this.currentTime = currentTime; + this.totalTime = totalTime; + } + + @Nullable + public Optional> getState() { + return Optional.ofNullable(state); + } + + @Nullable + public Optional> getCurrentTime() { + return Optional.ofNullable(currentTime); + } + + @Nullable + public Optional> getTotalTime() { + return Optional.ofNullable(totalTime); + } + } +} 
diff --git a/flink-test-utils-parent/flink-test-utils-junit/pom.xml b/flink-test-utils-parent/flink-test-utils-junit/pom.xml index d203ea3d96822..ad5c762803352 100644 --- a/flink-test-utils-parent/flink-test-utils-junit/pom.xml +++ b/flink-test-utils-parent/flink-test-utils-junit/pom.xml @@ -56,6 +56,7 @@ under the License. log4j-core compile + From a134200b7aedb61dffa917464ec9b070075524ca Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Mon, 8 Aug 2022 23:12:46 -0700 Subject: [PATCH 15/27] update to write to lib --- flink-python/apache-flink-libraries/setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index f628f68f4423a..575f791b6ff4e 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -122,7 +122,7 @@ def find_file_path(pattern): OPT_SQL_CLIENT_JAR_NAME = os.path.basename( find_file_path(os.path.join(OPT_PATH, "flink-sql-client_*.jar"))) OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( - find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive-3.1.2_*.jar"))) + find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive-2.3.6_*.jar"))) LICENSES_PATH = os.path.join(FLINK_HOME, "licenses") PLUGINS_PATH = os.path.join(FLINK_HOME, "plugins") SCRIPTS_PATH = os.path.join(FLINK_HOME, "bin") @@ -148,8 +148,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - os.symlink(os.path.join(OPT_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(OPT_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) + os.symlink(os.path.join(LIB_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(LIB_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) 
os.symlink(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) os.symlink(README_FILE_PATH, README_FILE_TEMP_PATH) @@ -160,8 +160,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) copy(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - copy(os.path.join(OPT_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(OPT_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) + copy(os.path.join(LIB_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(LIB_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) copytree(PLUGINS_PATH, PLUGINS_TEMP_PATH) copy(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) copy(README_FILE_PATH, README_FILE_TEMP_PATH) From db3f91bd32e9b7a45c14e0a4fb3d443e24191d59 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 9 Aug 2022 00:22:16 -0700 Subject: [PATCH 16/27] lib path --- flink-python/apache-flink-libraries/setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index 575f791b6ff4e..79d3cd2d87554 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -121,8 +121,8 @@ def find_file_path(pattern): find_file_path(os.path.join(OPT_PATH, "flink-python_*.jar"))) OPT_SQL_CLIENT_JAR_NAME = os.path.basename( find_file_path(os.path.join(OPT_PATH, "flink-sql-client_*.jar"))) - OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( - find_file_path(os.path.join(OPT_PATH, "flink-sql-connector-hive-2.3.6_*.jar"))) + LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( + find_file_path(os.path.join(LIB_PATH, "flink-sql-connector-hive-2.3.6_*.jar"))) LICENSES_PATH = os.path.join(FLINK_HOME, "licenses") PLUGINS_PATH = os.path.join(FLINK_HOME, "plugins") SCRIPTS_PATH = os.path.join(FLINK_HOME, "bin") @@ -148,8 +148,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, 
OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - os.symlink(os.path.join(LIB_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(LIB_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) + os.symlink(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) os.symlink(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) os.symlink(README_FILE_PATH, README_FILE_TEMP_PATH) @@ -160,8 +160,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) copy(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - copy(os.path.join(LIB_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(LIB_TEMP_PATH, OPT_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) + copy(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), + os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) copytree(PLUGINS_PATH, PLUGINS_TEMP_PATH) copy(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) copy(README_FILE_PATH, README_FILE_TEMP_PATH) From e0654a81c17f975f5f9729d0eb2c015cca6c3bde Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 9 Aug 2022 01:22:26 -0700 Subject: [PATCH 17/27] unlink the symlink --- flink-python/apache-flink-libraries/setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index 79d3cd2d87554..a8a1af05c09f0 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -148,6 +148,8 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) + # Ensure the path we are trying to symlink to does not exist. 
+ os.unlink(os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) From e762062192fb863a2ea3fb523b19f04d2511284c Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 9 Aug 2022 11:02:01 -0700 Subject: [PATCH 18/27] wip --- flink-python/apache-flink-libraries/setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index a8a1af05c09f0..79d3cd2d87554 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -148,8 +148,6 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - # Ensure the path we are trying to symlink to does not exist. 
- os.unlink(os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) From 7678c39f9c1d297a45b19dcc7b8922ac88078e3d Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 9 Aug 2022 11:38:45 -0700 Subject: [PATCH 19/27] remove the explicit JAR target since this JAR should come from the lib directory --- flink-python/apache-flink-libraries/setup.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/flink-python/apache-flink-libraries/setup.py b/flink-python/apache-flink-libraries/setup.py index 79d3cd2d87554..42b99b5dc7e66 100644 --- a/flink-python/apache-flink-libraries/setup.py +++ b/flink-python/apache-flink-libraries/setup.py @@ -121,8 +121,6 @@ def find_file_path(pattern): find_file_path(os.path.join(OPT_PATH, "flink-python_*.jar"))) OPT_SQL_CLIENT_JAR_NAME = os.path.basename( find_file_path(os.path.join(OPT_PATH, "flink-sql-client_*.jar"))) - LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME = os.path.basename( - find_file_path(os.path.join(LIB_PATH, "flink-sql-connector-hive-2.3.6_*.jar"))) LICENSES_PATH = os.path.join(FLINK_HOME, "licenses") PLUGINS_PATH = os.path.join(FLINK_HOME, "plugins") SCRIPTS_PATH = os.path.join(FLINK_HOME, "bin") @@ -148,8 +146,6 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) os.symlink(os.path.join(OPT_PATH, OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - os.symlink(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) os.symlink(PLUGINS_PATH, PLUGINS_TEMP_PATH) os.symlink(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) os.symlink(README_FILE_PATH, README_FILE_TEMP_PATH) @@ -160,8 +156,6 @@ def find_file_path(pattern): os.path.join(OPT_TEMP_PATH, OPT_PYTHON_JAR_NAME)) copy(os.path.join(OPT_PATH, 
OPT_SQL_CLIENT_JAR_NAME), os.path.join(OPT_TEMP_PATH, OPT_SQL_CLIENT_JAR_NAME)) - copy(os.path.join(LIB_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME), - os.path.join(LIB_TEMP_PATH, LIB_FLINK_SQL_CONNECTOR_HIVE_JAR_NAME)) copytree(PLUGINS_PATH, PLUGINS_TEMP_PATH) copy(LICENSE_FILE_PATH, LICENSE_FILE_TEMP_PATH) copy(README_FILE_PATH, README_FILE_TEMP_PATH) From e235648e3f59da7342a9997ee41c0937df288ae1 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Tue, 9 Aug 2022 12:08:26 -0700 Subject: [PATCH 20/27] Revert "update zk version to 3.5.6" This reverts commit 365ffa8ad0ab13d2f227cb8fd9d55bb8248805ea. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a0afdeab01b1c..226abad21dd54 100644 --- a/pom.xml +++ b/pom.xml @@ -119,7 +119,7 @@ under the License. 2.11.12 2.11 0.7.6 - 3.5.6 + 3.4.14 2.12.0 2.12.1 From df0bc14e03b60156ec062cb55884b7385ed78cd1 Mon Sep 17 00:00:00 2001 From: ravimagham Date: Wed, 22 Sep 2021 01:14:15 +0200 Subject: [PATCH 21/27] [FLINK-24347][connectors/kafka] Keep idle source readers if parallelism is higher than partitions in KafkaSource Before this commit the enumerator signalled the leftover source readers without a partition to finish. This caused that checkpointing was not possible anymore because it is only supported if all tasks are running or FLIP-147 is enabled. 
This closes #17330 --- .../connector/kafka/source/KafkaSource.java | 4 +- .../enumerator/KafkaSourceEnumerator.java | 10 ++++- .../enumerator/KafkaEnumeratorTest.java | 2 + .../kafka/KafkaConsumerTestBase.java | 4 +- .../testutils/ValidatingExactlyOnceSink.java | 38 ++++++++++++++----- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java b/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java index 35fd9544825cb..5904f90c7db56 100644 --- a/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java +++ b/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java @@ -163,7 +163,8 @@ public SplitEnumerator createEnumerat startingOffsetsInitializer, stoppingOffsetsInitializer, props, - enumContext); + enumContext, + boundedness); } @Override @@ -177,6 +178,7 @@ public SplitEnumerator restoreEnumera stoppingOffsetsInitializer, props, enumContext, + boundedness, checkpoint.assignedPartitions()); } diff --git a/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/KafkaSourceEnumerator.java b/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/KafkaSourceEnumerator.java index 9c69f3836cd63..1af52c2808d17 100644 --- a/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/KafkaSourceEnumerator.java +++ b/flink-connectors/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/KafkaSourceEnumerator.java @@ -20,6 +20,7 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.api.connector.source.Boundedness; import 
org.apache.flink.api.connector.source.SplitEnumerator; import org.apache.flink.api.connector.source.SplitEnumeratorContext; import org.apache.flink.api.connector.source.SplitsAssignment; @@ -64,6 +65,7 @@ public class KafkaSourceEnumerator private final Properties properties; private final long partitionDiscoveryIntervalMs; private final SplitEnumeratorContext context; + private final Boundedness boundedness; // The internal states of the enumerator. /** @@ -97,13 +99,15 @@ public KafkaSourceEnumerator( OffsetsInitializer startingOffsetInitializer, OffsetsInitializer stoppingOffsetInitializer, Properties properties, - SplitEnumeratorContext context) { + SplitEnumeratorContext context, + Boundedness boundedness) { this( subscriber, startingOffsetInitializer, stoppingOffsetInitializer, properties, context, + boundedness, Collections.emptySet()); } @@ -113,12 +117,14 @@ public KafkaSourceEnumerator( OffsetsInitializer stoppingOffsetInitializer, Properties properties, SplitEnumeratorContext context, + Boundedness boundedness, Set assignedPartitions) { this.subscriber = subscriber; this.startingOffsetInitializer = startingOffsetInitializer; this.stoppingOffsetInitializer = stoppingOffsetInitializer; this.properties = properties; this.context = context; + this.boundedness = boundedness; this.discoveredPartitions = new HashSet<>(); this.assignedPartitions = new HashSet<>(assignedPartitions); @@ -296,7 +302,7 @@ private void assignPendingPartitionSplits(Set pendingReaders) { // If periodically partition discovery is disabled and the initializing discovery has done, // signal NoMoreSplitsEvent to pending readers - if (noMoreNewPartitionSplits) { + if (noMoreNewPartitionSplits && boundedness == Boundedness.BOUNDED) { LOG.debug( "No more KafkaPartitionSplits to assign. 
Sending NoMoreSplitsEvent to reader {}" + " in consumer group {}.", diff --git a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/KafkaEnumeratorTest.java b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/KafkaEnumeratorTest.java index a46ac2d064180..edb4c816d7019 100644 --- a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/KafkaEnumeratorTest.java +++ b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/KafkaEnumeratorTest.java @@ -18,6 +18,7 @@ package org.apache.flink.connector.kafka.source.enumerator; +import org.apache.flink.api.connector.source.Boundedness; import org.apache.flink.api.connector.source.ReaderInfo; import org.apache.flink.api.connector.source.mocks.MockSplitEnumeratorContext; import org.apache.flink.connector.kafka.source.KafkaSourceOptions; @@ -432,6 +433,7 @@ private KafkaSourceEnumerator createEnumerator( stoppingOffsetsInitializer, props, enumContext, + Boundedness.CONTINUOUS_UNBOUNDED, assignedPartitions); } diff --git a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/KafkaConsumerTestBase.java b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/KafkaConsumerTestBase.java index 93e0cfef96b5b..a76534f8f5021 100644 --- a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/KafkaConsumerTestBase.java +++ b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/KafkaConsumerTestBase.java @@ -1104,8 +1104,10 @@ public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception { getStream(env, topic, schema, props) .map(new PartitionValidatingMapper(numPartitions, 1)) + // Job only fails after a checkpoint is taken and the necessary number 
of elements + // is seen .map(new FailingIdentityMapper(failAfterElements)) - .addSink(new ValidatingExactlyOnceSink(totalElements)) + .addSink(new ValidatingExactlyOnceSink(totalElements, true)) .setParallelism(1); FailingIdentityMapper.failedBefore = false; diff --git a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/testutils/ValidatingExactlyOnceSink.java b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/testutils/ValidatingExactlyOnceSink.java index 41a2d86c69338..f00be219c5f2f 100644 --- a/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/testutils/ValidatingExactlyOnceSink.java +++ b/flink-connectors/flink-connector-kafka/src/test/java/org/apache/flink/streaming/connectors/kafka/testutils/ValidatingExactlyOnceSink.java @@ -18,6 +18,7 @@ package org.apache.flink.streaming.connectors.kafka.testutils; +import org.apache.flink.api.common.state.CheckpointListener; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.streaming.api.checkpoint.ListCheckpointed; import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; @@ -32,20 +33,26 @@ /** A {@link RichSinkFunction} that verifies that no duplicate records are generated. 
*/ public class ValidatingExactlyOnceSink extends RichSinkFunction - implements ListCheckpointed> { + implements ListCheckpointed>, CheckpointListener { private static final Logger LOG = LoggerFactory.getLogger(ValidatingExactlyOnceSink.class); private static final long serialVersionUID = 1748426382527469932L; private final int numElementsTotal; + private final boolean waitForFinalCheckpoint; private BitSet duplicateChecker = new BitSet(); // this is checkpointed private int numElements; // this is checkpointed public ValidatingExactlyOnceSink(int numElementsTotal) { + this(numElementsTotal, false); + } + + public ValidatingExactlyOnceSink(int numElementsTotal, boolean waitForFinalCheckpoint) { this.numElementsTotal = numElementsTotal; + this.waitForFinalCheckpoint = waitForFinalCheckpoint; } @Override @@ -56,15 +63,8 @@ public void invoke(Integer value) throws Exception { throw new Exception("Received a duplicate: " + value); } duplicateChecker.set(value); - if (numElements == numElementsTotal) { - // validate - if (duplicateChecker.cardinality() != numElementsTotal) { - throw new Exception("Duplicate checker has wrong cardinality"); - } else if (duplicateChecker.nextClearBit(0) != numElementsTotal) { - throw new Exception("Received sparse sequence"); - } else { - throw new SuccessException(); - } + if (!waitForFinalCheckpoint) { + checkFinish(); } } @@ -87,4 +87,22 @@ public void restoreState(List> state) throws Exception { this.numElements = s.f0; this.duplicateChecker = s.f1; } + + @Override + public void notifyCheckpointComplete(long checkpointId) throws Exception { + checkFinish(); + } + + private void checkFinish() throws Exception { + if (numElements == numElementsTotal) { + // validate + if (duplicateChecker.cardinality() != numElementsTotal) { + throw new Exception("Duplicate checker has wrong cardinality"); + } else if (duplicateChecker.nextClearBit(0) != numElementsTotal) { + throw new Exception("Received sparse sequence"); + } else { + throw new 
SuccessException(); + } + } + } } From 98a866d1a663cf366d15ac4ca6c6233c85573e19 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 9 Aug 2022 17:03:04 -0700 Subject: [PATCH 22/27] apache flink libraries version --- flink-python/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flink-python/setup.py b/flink-python/setup.py index b2aad05fd4288..3e523da8b4531 100644 --- a/flink-python/setup.py +++ b/flink-python/setup.py @@ -180,6 +180,7 @@ def extracted_output_files(base_dir, file_path, output_directory): sys.exit(-1) VERSION = __version__ # noqa APACHE_FLINK_VERSION = '1.13.0' +APACHE_FLINK_LIBRARIES_VERSION = '1.13+lyft202208091660085522' with io.open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf-8') as f: @@ -254,7 +255,7 @@ def extracted_output_files(base_dir, file_path, output_directory): "is complete, or do this in the flink-python directory of the flink source " "directory.") sys.exit(-1) - apache_flink_libraries_dependency = 'apache-flink-libraries==%s' % APACHE_FLINK_VERSION + apache_flink_libraries_dependency = 'apache-flink-libraries==%s' % APACHE_FLINK_LIBRARIES_VERSION script_names = ["pyflink-shell.sh", "find-flink-home.sh"] scripts = [os.path.join(SCRIPTS_TEMP_PATH, script) for script in script_names] scripts.append("pyflink/find_flink_home.py") From 8733b7b031413421a556f09032b65a29c34520f0 Mon Sep 17 00:00:00 2001 From: Seth Saperstein <99828679+sethsaperstein-lyft@users.noreply.github.com> Date: Wed, 31 Aug 2022 20:45:34 -0700 Subject: [PATCH 23/27] [STRMHELP-197] Update Global Watermark When Idle (#61) * add getter for global watermark manager * DRY * DRY test * additional logging and feature flag * fix log * metric for isIdle * change noOp to updateLocalWatermark * remove unused import --- .../config/ConsumerConfigConstants.java | 5 ++ .../kinesis/internals/KinesisDataFetcher.java | 49 ++++++++++++++++--- .../util/JobManagerWatermarkTracker.java | 18 +++++++ 
.../kinesis/util/WatermarkTracker.java | 2 + .../kinesis/FlinkKinesisConsumerTest.java | 5 ++ .../util/JobManagerWatermarkTrackerTest.java | 3 ++ .../kinesis/util/WatermarkTrackerTest.java | 17 ++++++- 7 files changed, 89 insertions(+), 10 deletions(-) diff --git a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/config/ConsumerConfigConstants.java b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/config/ConsumerConfigConstants.java index 4ea0bd82af687..046818d891ad9 100644 --- a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/config/ConsumerConfigConstants.java +++ b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/config/ConsumerConfigConstants.java @@ -303,6 +303,9 @@ public enum EFORegistrationType { /** The maximum delta allowed for the reader to advance ahead of the shared global watermark. */ public static final String WATERMARK_LOOKAHEAD_MILLIS = "flink.watermark.lookahead.millis"; + /** Feature flag to update global watermark when idle. */ + public static final String WATERMARK_SYNC_GLOBAL = "flink.watermark.sync.global"; + /** * The maximum number of records that will be buffered before suspending consumption of a shard. 
*/ @@ -403,6 +406,8 @@ public enum EFORegistrationType { public static final long DEFAULT_WATERMARK_SYNC_MILLIS = 30_000; + public static final boolean DEFAULT_WATERMARK_SYNC_GLOBAL = false; + public static final int DEFAULT_EFO_HTTP_CLIENT_MAX_CONURRENCY = 10_000; /** diff --git a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java index 9aee3e47a5a4e..a4b51c8a1afa0 100644 --- a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java +++ b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java @@ -142,6 +142,9 @@ public class KinesisDataFetcher { /** The metric group that all metrics should be registered to. */ private final MetricGroup consumerMetricGroup; + /** The metric group for the individual subtask. 
*/ + private final MetricGroup shardMetricsGroup; + // ------------------------------------------------------------------------ // Subtask-specific settings // ------------------------------------------------------------------------ @@ -415,7 +418,9 @@ protected KinesisDataFetcher( runtimeContext .getMetricGroup() .addGroup(KinesisConsumerMetricConstants.KINESIS_CONSUMER_METRICS_GROUP); - + this.shardMetricsGroup = + consumerMetricGroup.addGroup( + "subtaskId", String.valueOf(indexOfThisConsumerSubtask)); this.error = checkNotNull(error); this.subscribedShardsState = checkNotNull(subscribedShardsState); this.subscribedStreamsToLastDiscoveredShardIds = @@ -626,7 +631,19 @@ public void runFetcher() throws Exception { watermarkTracker.setUpdateTimeoutMillis( watermarkSyncMillis * 3); // synchronization latency watermarkTracker.open(runtimeContext); - new WatermarkSyncCallback(timerService, watermarkSyncMillis).start(); + boolean updateGlobalWatermarkForIdleSubtask = + Boolean.parseBoolean( + getConsumerConfiguration() + .getProperty( + ConsumerConfigConstants.WATERMARK_SYNC_GLOBAL, + Boolean.toString( + ConsumerConfigConstants + .DEFAULT_WATERMARK_SYNC_GLOBAL))); + new WatermarkSyncCallback( + timerService, + watermarkSyncMillis, + updateGlobalWatermarkForIdleSubtask) + .start(); // emit records ahead of watermark to offset synchronization latency long lookaheadMillis = Long.parseLong( @@ -1172,6 +1189,14 @@ protected void emitWatermark() { } } + LOG.debug( + "WatermarkEmitter subtask: {}, last watermark: {}, potential watermark: {}" + + ", potential next watermark: {}", + indexOfThisConsumerSubtask, + lastWatermark, + potentialWatermark, + potentialNextWatermark); + // advance watermark if possible (watermarks can only be ascending) if (potentialWatermark == Long.MAX_VALUE) { if (shardWatermarks.isEmpty() || shardIdleIntervalMillis > 0) { @@ -1193,6 +1218,7 @@ protected void emitWatermark() { isIdle = false; } nextWatermark = potentialNextWatermark; + 
shardMetricsGroup.gauge("isIdle", () -> isIdle ? 1 : 0); } } @@ -1239,17 +1265,20 @@ private class WatermarkSyncCallback implements ProcessingTimeCallback { private final ProcessingTimeService timerService; private final long interval; + private final boolean updateGlobalWatermarkForIdleSubtask; private long lastGlobalWatermark = Long.MIN_VALUE; private long propagatedLocalWatermark = Long.MIN_VALUE; private int stalledWatermarkIntervalCount = 0; private long lastLogged; - WatermarkSyncCallback(ProcessingTimeService timerService, long interval) { + WatermarkSyncCallback( + ProcessingTimeService timerService, + long interval, + boolean updateGlobalWatermarkForIdleSubtask) { this.timerService = checkNotNull(timerService); this.interval = interval; - MetricGroup shardMetricsGroup = - consumerMetricGroup.addGroup( - "subtaskId", String.valueOf(indexOfThisConsumerSubtask)); + this.updateGlobalWatermarkForIdleSubtask = updateGlobalWatermarkForIdleSubtask; + shardMetricsGroup.gauge("localWatermark", () -> nextWatermark); shardMetricsGroup.gauge("globalWatermark", () -> lastGlobalWatermark); } @@ -1263,11 +1292,13 @@ public void start() { public void onProcessingTime(long timestamp) { if (nextWatermark != Long.MIN_VALUE) { long globalWatermark = lastGlobalWatermark; - // TODO: refresh watermark while idle if (!(isIdle && nextWatermark == propagatedLocalWatermark)) { globalWatermark = watermarkTracker.updateWatermark(nextWatermark); propagatedLocalWatermark = nextWatermark; } else { + if (updateGlobalWatermarkForIdleSubtask) { + globalWatermark = watermarkTracker.getWatermark(); + } LOG.info( "WatermarkSyncCallback subtask: {} is idle", indexOfThisConsumerSubtask); @@ -1277,12 +1308,14 @@ public void onProcessingTime(long timestamp) { lastLogged = System.currentTimeMillis(); LOG.info( "WatermarkSyncCallback subtask: {} local watermark: {}" - + ", global watermark: {}, delta: {} timeouts: {}, emitter: {}", + + ", global watermark: {}, delta: {} timeouts: {}, idle: {}" + + ", 
emitter: {}", indexOfThisConsumerSubtask, nextWatermark, globalWatermark, nextWatermark - globalWatermark, watermarkTracker.getUpdateTimeoutCount(), + isIdle, recordEmitter.printInfo()); // Following is for debugging non-reproducible issue with stalled watermark diff --git a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTracker.java b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTracker.java index b4c78438dca00..066ece21d192e 100644 --- a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTracker.java +++ b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTracker.java @@ -58,6 +58,18 @@ public long updateWatermark(long localWatermark) { WatermarkUpdate update = new WatermarkUpdate(); update.id = getSubtaskId(); update.watermark = localWatermark; + return updateWatermark(update); + } + + @Override + public long getWatermark() { + WatermarkUpdate update = new WatermarkUpdate(); + update.id = getSubtaskId(); + update.updateLocalWatermark = false; + return updateWatermark(update); + } + + public long updateWatermark(WatermarkUpdate update) { try { byte[] resultBytes = aggregateManager.updateGlobalAggregate( @@ -92,6 +104,7 @@ public long getUpdateTimeoutCount() { protected static class WatermarkUpdate implements Serializable { protected long watermark = Long.MIN_VALUE; protected String id; + protected boolean updateLocalWatermark = true; } /** Watermark aggregation result. 
*/ @@ -129,6 +142,11 @@ public Map add( } catch (Exception e) { throw new RuntimeException(e); } + // no op to get global watermark without updating it + if (!value.updateLocalWatermark) { + addCount--; + return accumulator; + } WatermarkState ws = accumulator.get(value.id); if (ws == null) { accumulator.put(value.id, ws = new WatermarkState()); diff --git a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTracker.java b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTracker.java index 954a8b1f86c88..cb18d766a17ef 100644 --- a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTracker.java +++ b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTracker.java @@ -95,6 +95,8 @@ public void setUpdateTimeoutMillis(long updateTimeoutMillis) { */ public abstract long updateWatermark(final long localWatermark); + public abstract long getWatermark(); + protected long getCurrentTime() { return System.currentTimeMillis(); } diff --git a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/FlinkKinesisConsumerTest.java b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/FlinkKinesisConsumerTest.java index d48e04ed4e69a..0a5e35dbb3aa4 100644 --- a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/FlinkKinesisConsumerTest.java +++ b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/FlinkKinesisConsumerTest.java @@ -1256,6 +1256,11 @@ public long updateWatermark(long localWatermark) { return localWatermark; } + @Override + public long getWatermark() { + return WATERMARK.get(); + } + static void assertGlobalWatermark(long 
expected) { Assert.assertEquals(expected, WATERMARK.get()); } diff --git a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTrackerTest.java b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTrackerTest.java index 6b1f8d06b67d3..43f9d23608f8a 100644 --- a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTrackerTest.java +++ b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/JobManagerWatermarkTrackerTest.java @@ -93,6 +93,9 @@ public void open(Configuration parameters) throws Exception { public void run(SourceContext ctx) { Assert.assertEquals(998, tracker.updateWatermark(998)); Assert.assertEquals(999, tracker.updateWatermark(999)); + Assert.assertEquals(999, tracker.getWatermark()); + Assert.assertEquals(1000, tracker.updateWatermark(1000)); + Assert.assertEquals(1000, tracker.getWatermark()); } @Override diff --git a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTrackerTest.java b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTrackerTest.java index fd98d920b12a3..16e635aaa02ed 100644 --- a/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTrackerTest.java +++ b/flink-connectors/flink-connector-kinesis/src/test/java/org/apache/flink/streaming/connectors/kinesis/util/WatermarkTrackerTest.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.Map; +import java.util.Optional; /** Test for {@link WatermarkTracker}. 
*/ public class WatermarkTrackerTest { @@ -45,6 +46,15 @@ protected long getCurrentTime() { @Override public long updateWatermark(final long localWatermark) { + return updateWatermark(Optional.of(localWatermark)); + } + + @Override + public long getWatermark() { + return updateWatermark(Optional.empty()); + } + + private long updateWatermark(Optional localWatermark) { refreshWatermarkSnapshot(this.watermarks); long currentTime = getCurrentTime(); @@ -54,8 +64,11 @@ public long updateWatermark(final long localWatermark) { if (ws == null) { watermarks.put(subtaskId, ws = new WatermarkState()); } - ws.lastUpdated = currentTime; - ws.watermark = Math.max(ws.watermark, localWatermark); + // empty if getting without updating + if (localWatermark.isPresent()) { + ws.lastUpdated = currentTime; + ws.watermark = Math.max(ws.watermark, localWatermark.get()); + } saveWatermark(subtaskId, ws); long globalWatermark = ws.watermark; From aec7944f93a429d60b0acc3fbf57ded5066c4032 Mon Sep 17 00:00:00 2001 From: Seth Saperstein <99828679+sethsaperstein-lyft@users.noreply.github.com> Date: Wed, 21 Sep 2022 17:33:44 -0700 Subject: [PATCH 24/27] [STRMCMP-1477] Zookeeper 3.5 Upgrade (#65) * bump zk version * update to 3.5.10 * Commit for release 1.13-lyft202209211663788204 * try to make proper zk version available * Revert "Commit for release 1.13-lyft202209211663788204" This reverts commit c0fc93fe5927cd1493672156500960e291253ca2. * Commit for release 1.13-lyft202209211663790698 * Revert "Commit for release 1.13-lyft202209211663790698" This reverts commit 086ef31af8514c331ea2139442f058e0d417dc19. * change to flink 3.5.6 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 226abad21dd54..a0afdeab01b1c 100644 --- a/pom.xml +++ b/pom.xml @@ -119,7 +119,7 @@ under the License. 
2.11.12 2.11 0.7.6 - 3.4.14 + 3.5.6 2.12.0 2.12.1 From dc7d5c2f98f243bd3fa92a9f4551666b47906148 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Thu, 29 Sep 2022 20:19:25 -0700 Subject: [PATCH 25/27] Update setup.py --- flink-python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flink-python/setup.py b/flink-python/setup.py index 3e523da8b4531..239d1328c327b 100644 --- a/flink-python/setup.py +++ b/flink-python/setup.py @@ -180,7 +180,7 @@ def extracted_output_files(base_dir, file_path, output_directory): sys.exit(-1) VERSION = __version__ # noqa APACHE_FLINK_VERSION = '1.13.0' -APACHE_FLINK_LIBRARIES_VERSION = '1.13+lyft202208091660085522' +APACHE_FLINK_LIBRARIES_VERSION = '1.13+lyft202208111660263173' with io.open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf-8') as f: From 2196926d20b5bd63f416355928bc4c76d0bfa205 Mon Sep 17 00:00:00 2001 From: Konstantin Gizdarski Date: Tue, 11 Oct 2022 10:58:04 -0700 Subject: [PATCH 26/27] unpin pickle --- flink-python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flink-python/setup.py b/flink-python/setup.py index 239d1328c327b..f359a09cf8722 100644 --- a/flink-python/setup.py +++ b/flink-python/setup.py @@ -304,7 +304,7 @@ def extracted_output_files(base_dir, file_path, output_directory): author_email='dev@flink.apache.org', python_requires='>3.6', install_requires=['py4j>=0.10.8.1,<=0.10.9.5', 'python-dateutil>=2.8.1', 'apache-beam==2.30.0+lyft202205161652748117', - 'cloudpickle==1.2.2', 'avro-python3>=1.8.1,!=1.9.2,<1.10.0', + 'cloudpickle>=1.2.2', 'avro-python3>=1.8.1,!=1.9.2,<1.10.0', 'pandas>=1.0,<1.2.0', 'pyarrow>=0.15.1,<=8.0.0', 'pytz>=2018.3', 'numpy>=1.14.3,<1.20', 'fastavro>=0.21.4,<0.24', apache_flink_libraries_dependency], From 869a2e96e351dc0cb060f7033090daf4b6bbdab8 Mon Sep 17 00:00:00 2001 From: Seth Saperstein Date: Thu, 13 Oct 2022 23:23:31 -0700 Subject: [PATCH 27/27] define gauge once --- 
.../connectors/kinesis/internals/KinesisDataFetcher.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java index a4b51c8a1afa0..e59c140955a27 100644 --- a/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java +++ b/flink-connectors/flink-connector-kinesis/src/main/java/org/apache/flink/streaming/connectors/kinesis/internals/KinesisDataFetcher.java @@ -21,6 +21,7 @@ import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.RuntimeContext; import org.apache.flink.api.common.serialization.RuntimeContextInitializationContextAdapters; +import org.apache.flink.metrics.Gauge; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks; import org.apache.flink.streaming.api.functions.source.SourceFunction; @@ -421,6 +422,7 @@ protected KinesisDataFetcher( this.shardMetricsGroup = consumerMetricGroup.addGroup( "subtaskId", String.valueOf(indexOfThisConsumerSubtask)); + this.shardMetricsGroup.gauge("isIdle", () -> isIdle ? 1 : 0); this.error = checkNotNull(error); this.subscribedShardsState = checkNotNull(subscribedShardsState); this.subscribedStreamsToLastDiscoveredShardIds = @@ -1218,7 +1220,6 @@ protected void emitWatermark() { isIdle = false; } nextWatermark = potentialNextWatermark; - shardMetricsGroup.gauge("isIdle", () -> isIdle ? 1 : 0); } }