-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* Provide WatermarkEstimator to track watermark * Merged conflicts and resolved conflicts * Change if to if not None * Changed deferred_watermark to deferred_timestamp * Add NoOpWatermarkEstimator * Plumb NoOpWatermarkEstimatorProvider through bundle_processor. * Changes to try_split in common.py * fix lint * Fix common.pxd * Fix formatter * Fix formatter again * Clean up if branch for watermark_estimator * Add default_provider() method to watermark_estimators * Use separate locks for ThreadsafeWatermarkEstimator and ThreadsafeRestrictionTracker * Fix formatter again * Add TODO to ManualWatermarkEstimator * Fix formatter * Fix lint * More comments * Minor performance update for critical piece of code. Co-authored-by: Robert Bradshaw <robertwb@gmail.com>
- Loading branch information
Showing
12 changed files
with
563 additions
and
200 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
"""A collection of WatermarkEstimator implementations that SplittableDoFns | ||
can use.""" | ||
|
||
# pytype: skip-file | ||
|
||
from __future__ import absolute_import | ||
|
||
from apache_beam.io.iobase import WatermarkEstimator | ||
from apache_beam.transforms.core import WatermarkEstimatorProvider | ||
from apache_beam.utils.timestamp import Timestamp | ||
|
||
|
||
class MonotonicWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which assumes that timestamps of all output records
  are increasing monotonically.

  The watermark is the most recently observed output timestamp.
  """
  def __init__(self, timestamp):
    """Creates the estimator from an initial watermark.

    Args:
      timestamp: the starting watermark. For a new <element, restriction>
        pair this is None; when resuming processing it is the last reported
        watermark.
    """
    self._watermark = timestamp

  def observe_timestamp(self, timestamp):
    """Records an output timestamp as the new watermark.

    Args:
      timestamp: the timestamp of the record being output.

    Raises:
      ValueError: if a watermark is already held and ``timestamp`` is
        earlier than it.
    """
    previous = self._watermark
    # TODO(BEAM-9312): Consider making it configurable to deal with late
    # timestamp.
    if previous is not None and timestamp < previous:
      raise ValueError(
          'A MonotonicWatermarkEstimator expects output '
          'timestamp to be increasing monotonically.')
    self._watermark = timestamp

  def current_watermark(self):
    """Returns the last observed timestamp, or the initial value if none."""
    return self._watermark

  def get_estimator_state(self):
    """Returns the state to resume from, which is the watermark itself."""
    return self._watermark

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    MonotonicWatermarkEstimator.

    The returned provider starts every estimator with a state of None.
    """
    class DefaultMonotonicWatermarkEstimator(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # No timestamp has been observed yet for a fresh pair.
        return None

      def create_watermark_estimator(self, estimator_state):
        return MonotonicWatermarkEstimator(estimator_state)

    provider = DefaultMonotonicWatermarkEstimator()
    return provider
|
||
|
||
class WalltimeWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which uses processing time as the estimated
  watermark.
  """
  def __init__(self, timestamp=None):
    """Creates the estimator.

    Args:
      timestamp: the starting watermark. When it is falsy (e.g. None), the
        value of ``Timestamp.now()`` at construction time is used instead.
    """
    if timestamp:
      self._timestamp = timestamp
    else:
      self._timestamp = Timestamp.now()

  def observe_timestamp(self, timestamp):
    """Ignores observed timestamps; only the clock drives this estimator."""
    pass

  def current_watermark(self):
    """Returns the later of the stored timestamp and ``Timestamp.now()``.

    The result is also stored, so the reported watermark never moves
    backwards even if the clock does.
    """
    now = Timestamp.now()
    self._timestamp = max(self._timestamp, now)
    return self._timestamp

  def get_estimator_state(self):
    """Returns the stored timestamp without consulting the clock."""
    return self._timestamp

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    WalltimeWatermarkEstimator.

    The returned provider starts every estimator with a state of None.
    """
    class DefaultWalltimeWatermarkEstimator(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # None makes the estimator fall back to the current time.
        return None

      def create_watermark_estimator(self, estimator_state):
        return WalltimeWatermarkEstimator(estimator_state)

    provider = DefaultWalltimeWatermarkEstimator()
    return provider
|
||
|
||
class ManualWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which is controlled manually from within a DoFn.

  The DoFn must invoke set_watermark to advance the watermark.
  """
  def __init__(self, watermark):
    """Creates the estimator.

    Args:
      watermark: the initial watermark; None when no watermark has been set.
    """
    self._watermark = watermark

  def observe_timestamp(self, timestamp):
    """Ignores observed timestamps; only set_watermark moves the watermark."""
    pass

  def current_watermark(self):
    """Returns the last watermark that was set, or the initial value."""
    return self._watermark

  def get_estimator_state(self):
    """Returns the state to resume from, which is the watermark itself."""
    return self._watermark

  def set_watermark(self, timestamp):
    """Advances the watermark to ``timestamp``.

    Please call set_watermark after calling restriction_tracker.try_claim()
    to prevent advancing the watermark early.

    Args:
      timestamp: the new watermark; must be a Timestamp.

    Raises:
      ValueError: if ``timestamp`` is not a Timestamp, or if it is earlier
        than the current watermark.
    """
    # TODO(BEAM-7473): It's possible that getting a slightly stale watermark
    # when performing split.
    if not isinstance(timestamp, Timestamp):
      raise ValueError('set_watermark expects a Timestamp as input')
    # Compare against None explicitly so the monotonic check never depends
    # on the truthiness of the stored watermark.
    if self._watermark is not None and self._watermark > timestamp:
      # Interpolate the values into the message; passing them as extra
      # ValueError arguments would leave the '%s' placeholders unformatted.
      raise ValueError(
          'Watermark must be monotonically increasing. '
          'Provided watermark %s is less than '
          'current watermark %s' % (timestamp, self._watermark))
    self._watermark = timestamp

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    ManualWatermarkEstimator.

    The returned provider starts every estimator with a state of None.
    """
    class DefaultManualWatermarkEstimatorProvider(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # No watermark has been set yet for a fresh pair.
        return None

      def create_watermark_estimator(self, estimator_state):
        return ManualWatermarkEstimator(estimator_state)

    return DefaultManualWatermarkEstimatorProvider()
106 changes: 106 additions & 0 deletions
106
sdks/python/apache_beam/io/watermark_estimators_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
"""Unit tests for built-in WatermarkEstimators""" | ||
|
||
# pytype: skip-file | ||
|
||
from __future__ import absolute_import | ||
|
||
import unittest | ||
|
||
import mock | ||
|
||
from apache_beam.io.iobase import WatermarkEstimator | ||
from apache_beam.io.watermark_estimators import ManualWatermarkEstimator | ||
from apache_beam.io.watermark_estimators import MonotonicWatermarkEstimator | ||
from apache_beam.io.watermark_estimators import WalltimeWatermarkEstimator | ||
from apache_beam.utils.timestamp import Duration | ||
from apache_beam.utils.timestamp import Timestamp | ||
|
||
|
||
class MonotonicWatermarkEstimatorTest(unittest.TestCase):
  """Tests for MonotonicWatermarkEstimator."""
  def test_initialize_from_state(self):
    # The state handed to the constructor is reported back unchanged.
    initial = Timestamp(10)
    estimator = MonotonicWatermarkEstimator(initial)
    self.assertIsInstance(estimator, WatermarkEstimator)
    self.assertEqual(estimator.get_estimator_state(), initial)

  def test_observe_timestamp(self):
    estimator = MonotonicWatermarkEstimator(Timestamp(10))
    # Increasing and repeated timestamps are accepted and become the
    # watermark.
    for seconds in (15, 20, 20):
      estimator.observe_timestamp(Timestamp(seconds))
      self.assertEqual(estimator.current_watermark(), Timestamp(seconds))
    # A timestamp earlier than the current watermark is rejected.
    with self.assertRaises(ValueError):
      estimator.observe_timestamp(Timestamp(10))

  def test_get_estimator_state(self):
    estimator = MonotonicWatermarkEstimator(Timestamp(10))
    estimator.observe_timestamp(Timestamp(15))
    state = estimator.get_estimator_state()
    self.assertEqual(state, Timestamp(15))
|
||
|
||
class WalltimeWatermarkEstimatorTest(unittest.TestCase):
  """Tests for WalltimeWatermarkEstimator."""
  @mock.patch('apache_beam.utils.timestamp.Timestamp.now')
  def test_initialization(self, mock_timestamp):
    # Timestamp.now is already patched inside this test, so calling it here
    # would return a MagicMock and make the assertion below vacuous. Use a
    # concrete Timestamp as the mocked clock value instead.
    now_time = Timestamp(1000)
    mock_timestamp.return_value = now_time
    watermark_estimator = WalltimeWatermarkEstimator()
    self.assertIsInstance(watermark_estimator, WatermarkEstimator)
    self.assertEqual(watermark_estimator.get_estimator_state(), now_time)

  def test_observe_timestamp(self):
    # Start ahead of the real clock so current_watermark() keeps the
    # initial value; observed timestamps must have no effect.
    now_time = Timestamp.now() + Duration(10)
    watermark_estimator = WalltimeWatermarkEstimator(now_time)
    watermark_estimator.observe_timestamp(Timestamp(10))
    watermark_estimator.observe_timestamp(Timestamp(10))
    self.assertEqual(watermark_estimator.current_watermark(), now_time)

  def test_advance_watermark_with_incorrect_sys_clock(self):
    # When the stored timestamp is ahead of the system clock, the watermark
    # must not move backwards to the clock value.
    initial_timestamp = Timestamp.now() + Duration(100)
    watermark_estimator = WalltimeWatermarkEstimator(initial_timestamp)
    self.assertEqual(watermark_estimator.current_watermark(), initial_timestamp)
    self.assertEqual(
        watermark_estimator.get_estimator_state(), initial_timestamp)
|
||
|
||
class ManualWatermarkEstimatorTest(unittest.TestCase):
  """Tests for ManualWatermarkEstimator."""
  def test_initialization(self):
    # With no initial watermark, both accessors report None.
    empty = ManualWatermarkEstimator(None)
    self.assertIsNone(empty.get_estimator_state())
    self.assertIsNone(empty.current_watermark())
    # An initial watermark is reported back as the state.
    seeded = ManualWatermarkEstimator(Timestamp(10))
    self.assertEqual(seeded.get_estimator_state(), Timestamp(10))

  def test_set_watermark(self):
    estimator = ManualWatermarkEstimator(None)
    self.assertIsNone(estimator.current_watermark())
    # Observing a timestamp must not move a manually controlled watermark.
    estimator.observe_timestamp(Timestamp(10))
    self.assertIsNone(estimator.current_watermark())
    # Each increasing set_watermark call becomes the current watermark.
    for seconds in (20, 30):
      estimator.set_watermark(Timestamp(seconds))
      self.assertEqual(estimator.current_watermark(), Timestamp(seconds))
    # Moving the watermark backwards is rejected.
    with self.assertRaises(ValueError):
      estimator.set_watermark(Timestamp(25))
|
||
|
||
# Run the test suite when this module is executed as a script.
if __name__ == '__main__':
  unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.