diff --git a/sdks/python/apache_beam/transforms/util.py b/sdks/python/apache_beam/transforms/util.py index 1d7572bebaf1..14ed2ad559bc 100644 --- a/sdks/python/apache_beam/transforms/util.py +++ b/sdks/python/apache_beam/transforms/util.py @@ -111,24 +111,28 @@ class CoGroupByKey(PTransform): 'tag2': ... , ... }) + where `[]` refers to an iterable, not a list. + For example, given:: {'tag1': pc1, 'tag2': pc2, 333: pc3} where:: - pc1 = [(k1, v1)] - pc2 = [] - pc3 = [(k1, v31), (k1, v32), (k2, v33)] + pc1 = beam.Create([(k1, v1)])) + pc2 = beam.Create([]) + pc3 = beam.Create([(k1, v31), (k1, v32), (k2, v33)]) - The output PCollection would be:: + The output PCollection would consist of items:: [(k1, {'tag1': [v1], 'tag2': [], 333: [v31, v32]}), (k2, {'tag1': [], 'tag2': [], 333: [v33]})] + where `[]` refers to an iterable, not a list. + CoGroupByKey also works for tuples, lists, or other flat iterables of PCollections, in which case the values of the resulting PCollections - will be tuples whose nth value is the list of values from the nth + will be tuples whose nth value is the iterable of values from the nth PCollection---conceptually, the "tags" are the indices into the input. Thus, for this input:: @@ -139,6 +143,8 @@ class CoGroupByKey(PTransform): [(k1, ([v1], [], [v31, v32]), (k2, ([], [], [v33]))] + where, again, `[]` refers to an iterable, not a list. + Attributes: **kwargs: Accepts a single named argument "pipeline", which specifies the pipeline that "owns" this PTransform. Ordinarily CoGroupByKey can obtain