-
Notifications
You must be signed in to change notification settings - Fork 4.4k
[BEAM-4006] Futurize transforms subpackage #5729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a8618f1
924c447
6b80c40
0092d4a
94cfda0
d848091
a224c58
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,12 +21,15 @@ | |
|
|
||
| import copy | ||
| import inspect | ||
| import itertools | ||
| import random | ||
| import re | ||
| import types | ||
| from builtins import map | ||
| from builtins import object | ||
| from builtins import range | ||
|
|
||
| from six import string_types | ||
| from future.builtins import filter | ||
| from past.builtins import unicode | ||
|
|
||
| from apache_beam import coders | ||
| from apache_beam import pvalue | ||
|
|
@@ -82,7 +85,6 @@ | |
| 'Impulse', | ||
| ] | ||
|
|
||
|
|
||
| # Type variables | ||
| T = typehints.TypeVariable('T') | ||
| K = typehints.TypeVariable('K') | ||
|
|
@@ -291,6 +293,9 @@ def __eq__(self, other): | |
| return self.param_id == other.param_id | ||
| return False | ||
|
|
||
| def __hash__(self): | ||
| return hash(self.param_id) | ||
|
|
||
| def __repr__(self): | ||
| return self.param_id | ||
|
|
||
|
|
@@ -698,7 +703,7 @@ def merge_accumulators(self, accumulators, *args, **kwargs): | |
|
|
||
| class ReiterableNonEmptyAccumulators(object): | ||
| def __iter__(self): | ||
| return itertools.ifilter(filter_fn, accumulators) | ||
| return filter(filter_fn, accumulators) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this might be a potential source of the performance loss, so I'll update this to use `itertools.ifilter` on PY2.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @charlesccychen and @tvalentyn: is there more detailed info on the benchmarks?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you. Let me test the pipeline with this change. Unfortunately, it's not easy to export the benchmark data. |
||
|
|
||
| # It's (weakly) assumed that self._fn is associative. | ||
| return self._fn(ReiterableNonEmptyAccumulators(), *args, **kwargs) | ||
|
|
@@ -902,7 +907,8 @@ def with_outputs(self, *tags, **main_kw): | |
| """ | ||
| main_tag = main_kw.pop('main', None) | ||
| if main_kw: | ||
| raise ValueError('Unexpected keyword arguments: %s' % main_kw.keys()) | ||
| raise ValueError('Unexpected keyword arguments: %s' % | ||
| list(main_kw)) | ||
| return _MultiParDo(self, tags, main_tag) | ||
|
|
||
| def _pardo_fn_data(self): | ||
|
|
@@ -1666,7 +1672,6 @@ def expand(self, pcoll): | |
|
|
||
|
|
||
| class Windowing(object): | ||
|
|
||
| def __init__(self, windowfn, triggerfn=None, accumulation_mode=None, | ||
| timestamp_combiner=None): | ||
| global AccumulationMode, DefaultTrigger # pylint: disable=global-variable-not-assigned | ||
|
|
@@ -1712,6 +1717,10 @@ def __eq__(self, other): | |
| and self.timestamp_combiner == other.timestamp_combiner) | ||
| return False | ||
|
|
||
| def __hash__(self): | ||
| return hash((self.windowfn, self.accumulation_mode, | ||
| self.timestamp_combiner)) | ||
|
|
||
| def is_default(self): | ||
| return self._is_default | ||
|
|
||
|
|
@@ -1792,7 +1801,7 @@ def __init__(self, windowfn, **kwargs): | |
| accumulation_mode = kwargs.pop('accumulation_mode', None) | ||
| timestamp_combiner = kwargs.pop('timestamp_combiner', None) | ||
| if kwargs: | ||
| raise ValueError('Unexpected keyword arguments: %s' % kwargs.keys()) | ||
| raise ValueError('Unexpected keyword arguments: %s' % list(kwargs)) | ||
| self.windowing = Windowing( | ||
| windowfn, triggerfn, accumulation_mode, timestamp_combiner) | ||
| super(WindowInto, self).__init__(self.WindowIntoFn(self.windowing)) | ||
|
|
@@ -1861,7 +1870,7 @@ def __init__(self, **kwargs): | |
| super(Flatten, self).__init__() | ||
| self.pipeline = kwargs.pop('pipeline', None) | ||
| if kwargs: | ||
| raise ValueError('Unexpected keyword arguments: %s' % kwargs.keys()) | ||
| raise ValueError('Unexpected keyword arguments: %s' % list(kwargs)) | ||
|
|
||
| def _extract_input_pvalues(self, pvalueish): | ||
| try: | ||
|
|
@@ -1906,7 +1915,7 @@ def __init__(self, value): | |
| value: An object of values for the PCollection | ||
| """ | ||
| super(Create, self).__init__() | ||
| if isinstance(value, string_types): | ||
| if isinstance(value, (unicode, str, bytes)): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need to check for both
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See #5729 (comment). Bytes in Python 3 also shouldn't be allowed, since we don't want to support creation of a PCollection of single bytes. |
||
| raise TypeError('PTransform Create: Refusing to treat string as ' | ||
| 'an iterable. (string=%r)' % value) | ||
| elif isinstance(value, dict): | ||
|
|
@@ -1941,7 +1950,7 @@ def get_windowing(self, unused_inputs): | |
|
|
||
| @staticmethod | ||
| def _create_source_from_iterable(values, coder): | ||
| return Create._create_source(map(coder.encode, values), coder) | ||
| return Create._create_source(list(map(coder.encode, values)), coder) | ||
|
|
||
| @staticmethod | ||
| def _create_source(serialized_values, coder): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,13 @@ | |
| # limitations under the License. | ||
| # | ||
|
|
||
| from __future__ import absolute_import | ||
| from __future__ import division | ||
|
|
||
| from builtins import map | ||
| from builtins import next | ||
| from builtins import range | ||
|
|
||
| from apache_beam.io import iobase | ||
| from apache_beam.transforms.core import Create | ||
|
|
||
|
|
@@ -57,15 +64,15 @@ def split(self, desired_bundle_size, start_position=None, | |
| start_position = 0 | ||
| if stop_position is None: | ||
| stop_position = len(self._serialized_values) | ||
| avg_size_per_value = self._total_size / len(self._serialized_values) | ||
| avg_size_per_value = self._total_size // len(self._serialized_values) | ||
| num_values_per_split = max( | ||
| int(desired_bundle_size / avg_size_per_value), 1) | ||
| int(desired_bundle_size // avg_size_per_value), 1) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need for an `int` call?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We still need to coerce it into an `int`.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good point. I didn't realise that |
||
| start = start_position | ||
| while start < stop_position: | ||
| end = min(start + num_values_per_split, stop_position) | ||
| remaining = stop_position - end | ||
| # Avoid having a too small bundle at the end. | ||
| if remaining < (num_values_per_split / 4): | ||
| if remaining < (num_values_per_split // 4): | ||
| end = stop_position | ||
| sub_source = Create._create_source( | ||
| self._serialized_values[start:end], self._coder) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,12 +15,17 @@ | |
| # limitations under the License. | ||
| # | ||
|
|
||
| # cython: language_level=3 | ||
|
|
||
| """A library of basic cythonized CombineFn subclasses. | ||
|
|
||
| For internal use only; no backwards-compatibility guarantees. | ||
| """ | ||
|
|
||
| from __future__ import absolute_import | ||
| from __future__ import division | ||
|
|
||
| from builtins import object | ||
|
|
||
| from apache_beam.transforms import core | ||
|
|
||
|
|
@@ -162,7 +167,7 @@ def extract_output(self): | |
| self.sum %= 2**64 | ||
| if self.sum >= INT64_MAX: | ||
| self.sum -= 2**64 | ||
| return self.sum / self.count if self.count else _NAN | ||
| return self.sum // self.count if self.count else _NAN | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please also make the change in line 266. |
||
|
|
||
|
|
||
| class CountCombineFn(AccumulatorCombineFn): | ||
|
|
@@ -258,7 +263,7 @@ def merge(self, accumulators): | |
| self.count += accumulator.count | ||
|
|
||
| def extract_output(self): | ||
| return self.sum / self.count if self.count else _NAN | ||
| return self.sum // self.count if self.count else _NAN | ||
|
|
||
|
|
||
| class SumFloatFn(AccumulatorCombineFn): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fine, but was there a particular reason it was added?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is because of the import
`from builtins import object` in apache_beam/transforms/display.py. This import adds an alias:
`next = __next__` for Python 2 and Python 3 compatibility. PipelineOptions (the tested class in this test) inherits from the HasDisplayData class defined in the display.py module.