In [1]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1


In [3]:
!pip install apache_beam==2.60.0



In [4]:
import calendar
import time

import apache_beam as beam

## **Latest:**

* Gets the element with the latest timestamp.
* We create a pipeline with a PCollection of produce, each element timestamped with its harvest date. We then use `Latest` to get the element with the latest timestamp from the PCollection.


In [6]:
def to_unix_time(time_str, format='%Y-%m-%d %H:%M:%S'):
  """Convert a timestamp string to seconds since the Unix epoch.

  The string carries no timezone information, so it is interpreted as UTC.
  This keeps the result independent of the timezone and DST rules of the
  machine that runs the notebook.

  Args:
    time_str: Timestamp text, e.g. '2020-02-24 00:00:00'.
    format: `time.strptime` format string describing `time_str`.

  Returns:
    Seconds since 1970-01-01 00:00:00 UTC, as a float.

  Raises:
    ValueError: If `time_str` does not match `format`.
  """
  # calendar.timegm is the UTC counterpart of time.mktime, which would
  # interpret the parsed struct_time in the machine's local timezone.
  return float(calendar.timegm(time.strptime(time_str, format)))

with beam.Pipeline() as pipeline:
  # Input records: one dict per crop, with its harvest date as text.
  crops = [
      {'item': '🥬', 'harvest': '2020-02-24 00:00:00'},
      {'item': '🍓', 'harvest': '2020-06-16 00:00:00'},
      {'item': '🥕', 'harvest': '2020-07-17 00:00:00'},
      {'item': '🍆', 'harvest': '2020-10-26 00:00:00'},
      {'item': '🍅', 'harvest': '2020-10-01 00:00:00'},
  ]
  harvested = pipeline | 'Create crops' >> beam.Create(crops)

  # Keep only the item and attach the harvest date as the element timestamp.
  stamped = harvested | 'With timestamps' >> beam.Map(
      lambda crop: beam.window.TimestampedValue(
          crop['item'], to_unix_time(crop['harvest'])))

  # Reduce the whole PCollection to the element with the newest timestamp.
  newest = stamped | 'Get latest element' >> beam.combiners.Latest.Globally()

  latest_element = newest | beam.Map(print)

🍆


In [7]:
# `to_unix_time` is defined once, in the `Latest.Globally` cell above. It is
# reused here rather than redefined with an identical body, so there is a
# single definition to maintain.
with beam.Pipeline() as pipeline:
  latest_elements_per_key = (
      pipeline
      # Each input is (season, {'item': ..., 'harvest': ...}).
      | 'Create crops' >> beam.Create([
          ('spring', {
              'item': '🥕', 'harvest': '2020-06-28 00:00:00'
          }),
          ('spring', {
              'item': '🍓', 'harvest': '2020-06-16 00:00:00'
          }),
          ('summer', {
              'item': '🥕', 'harvest': '2020-07-17 00:00:00'
          }),
          ('summer', {
              'item': '🍓', 'harvest': '2020-08-26 00:00:00'
          }),
          ('summer', {
              'item': '🍆', 'harvest': '2020-09-04 00:00:00'
          }),
          ('summer', {
              'item': '🥬', 'harvest': '2020-09-18 00:00:00'
          }),
          ('summer', {
              'item': '🍅', 'harvest': '2020-09-22 00:00:00'
          }),
          ('autumn', {
              'item': '🍅', 'harvest': '2020-10-01 00:00:00'
          }),
          ('autumn', {
              'item': '🥬', 'harvest': '2020-10-20 00:00:00'
          }),
          ('autumn', {
              'item': '🍆', 'harvest': '2020-10-26 00:00:00'
          }),
          ('winter', {
              'item': '🥬', 'harvest': '2020-02-24 00:00:00'
          }),
      ])
      # Reshape to (season, item) and use the harvest date as the timestamp.
      | 'With timestamps' >> beam.Map(
          lambda pair: beam.window.TimestampedValue(
              (pair[0], pair[1]['item']), to_unix_time(pair[1]['harvest'])))
      # For every season, keep the item with the newest timestamp.
      | 'Get latest elements per key' >> beam.combiners.Latest.PerKey()
      | beam.Map(print))

('spring', '🥕')
('summer', '🍅')
('autumn', '🍆')
('winter', '🥬')


## **Max:**

* Gets the element with the maximum value within each aggregation.
* We create a pipeline with a PCollection. Then, we get the element with the maximum value in different ways.




In [8]:
with beam.Pipeline() as pipeline:
  # A small in-memory PCollection of integers.
  numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])

  # Reduce the whole PCollection to its maximum. When `elements` is empty
  # the max is taken over [None] instead, so the result is None.
  maximum = numbers | 'Get max value' >> beam.CombineGlobally(
      lambda elements: max(elements or [None]))

  max_element = maximum | beam.Map(print)

4


Use `CombinePerKey()` with `max` to get the maximum element for each unique key in a PCollection of key-value pairs.

In [9]:
with beam.Pipeline() as pipeline:
  # (produce, value) pairs; several pairs may share the same key.
  produce_values = [
      ('🥕', 3),
      ('🥕', 2),
      ('🍆', 1),
      ('🍅', 4),
      ('🍅', 5),
      ('🍅', 3),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_values)

  # Apply the built-in `max` to the values grouped under each key.
  maxima = produce | 'Get max value per key' >> beam.CombinePerKey(max)

  elements_with_max_value_per_key = maxima | beam.Map(print)

('🥕', 3)
('🍆', 1)
('🍅', 5)


## **Min:**

* Gets the element with the minimum value within each aggregation.
* We create a pipeline with a PCollection. Then, we get the element with the minimum value in different ways.




In [10]:
with beam.Pipeline() as pipeline:
  min_element = (
      pipeline
      | 'Create numbers' >> beam.Create([3, 4, 1, 2])
      # When `elements` is empty, fall back to [None] so the result is None.
      # A numeric fallback such as -1 would look like a genuine minimum, and
      # None matches the fallback used by the Max example.
      | 'Get min value' >>
      beam.CombineGlobally(lambda elements: min(elements or [None]))
      | beam.Map(print))

1


Use `CombinePerKey()` with `min` to get the minimum element for each unique key in a PCollection of key-value pairs.

In [11]:
with beam.Pipeline() as pipeline:
  # (produce, value) pairs; several pairs may share the same key.
  produce_values = [
      ('🥕', 3),
      ('🥕', 2),
      ('🍆', 1),
      ('🍅', 4),
      ('🍅', 5),
      ('🍅', 3),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_values)

  # Apply the built-in `min` to the values grouped under each key.
  minima = produce | 'Get min value per key' >> beam.CombinePerKey(min)

  elements_with_min_value_per_key = minima | beam.Map(print)

('🥕', 2)
('🍆', 1)
('🍅', 3)


## **Mean:**

* Transforms for computing the arithmetic mean of the elements in a collection, or the mean of the values associated with each key in a collection of key-value pairs.
* We create a pipeline with a PCollection. Then, we compute the average of its values in different ways.




In [12]:
with beam.Pipeline() as pipeline:
  # A small in-memory PCollection of integers.
  numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])

  # Arithmetic mean over the whole PCollection.
  average = numbers | 'Get mean value' >> beam.combiners.Mean.Globally()

  mean_element = average | beam.Map(print)

2.5


**Use `Mean.PerKey()` to get the average of the values for each unique key in a PCollection of key-value pairs.**

In [13]:
with beam.Pipeline() as pipeline:
  # (produce, value) pairs; several pairs may share the same key.
  produce_values = [
      ('🥕', 3),
      ('🥕', 2),
      ('🍆', 1),
      ('🍅', 4),
      ('🍅', 5),
      ('🍅', 3),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_values)

  # Arithmetic mean of the values grouped under each key.
  averages = produce | 'Get mean value per key' >> beam.combiners.Mean.PerKey()

  elements_with_mean_value_per_key = averages | beam.Map(print)

('🥕', 2.5)
('🍆', 1.0)
('🍅', 4.0)


## **Sample:**

* Transforms for taking samples of the elements in a collection, or samples of the values associated with each key in a collection of key-value pairs.
* We create a pipeline with a PCollection. Then, we get a random sample of elements in different ways. Because the sample is random, the elements shown in the outputs below may differ from run to run.




Use `Sample.FixedSizeGlobally()` to get a fixed-size random sample of elements from the entire PCollection.

In [16]:
with beam.Pipeline() as pipeline:
  # Five produce names to sample from.
  produce_names = [
      '🍓 Strawberry',
      '🥕 Carrot',
      '🍆 Eggplant',
      '🍅 Tomato',
      '🥔 Potato',
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_names)

  # Take a sample of size 3 from the whole PCollection.
  sampled = produce | 'Sample N elements' >> (
      beam.combiners.Sample.FixedSizeGlobally(3))

  sample = sampled | beam.Map(print)

['🥔 Potato', '🍅 Tomato', '🍆 Eggplant']


In [18]:
with beam.Pipeline() as pipeline:
  # (season, produce) pairs; seasons have between one and four entries.
  seasonal_produce = [
      ('spring', '🍓'),
      ('spring', '🥕'),
      ('spring', '🍆'),
      ('spring', '🍅'),
      ('summer', '🥕'),
      ('summer', '🍅'),
      ('summer', '🌽'),
      ('fall', '🥕'),
      ('fall', '🍅'),
      ('winter', '🍆'),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(seasonal_produce)

  # Sample with a requested size of 3 from the values under each key.
  sampled = produce | 'Samples per key' >> (
      beam.combiners.Sample.FixedSizePerKey(3))

  samples_per_key = sampled | beam.Map(print)

('spring', ['🥕', '🍅', '🍓'])
('summer', ['🌽', '🍅', '🥕'])
('fall', ['🥕', '🍅'])
('winter', ['🍆'])


## **Sum:**

* Sums all the elements within each aggregation.
* We create a pipeline with a PCollection. Then, we get the sum of all the element values in different ways.




In [None]:
with beam.Pipeline() as pipeline:
  # A small in-memory PCollection of integers.
  numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])

  # Apply the built-in `sum` across the whole PCollection.
  summed = numbers | 'Sum values' >> beam.CombineGlobally(sum)

  total = summed | beam.Map(print)



10


Use `CombinePerKey()` with `sum` to get the sum of all the element values for each unique key in a PCollection of key-value pairs.

In [19]:
with beam.Pipeline() as pipeline:
  # (produce, value) pairs; several pairs may share the same key.
  produce_values = [
      ('🥕', 3),
      ('🥕', 2),
      ('🍆', 1),
      ('🍅', 4),
      ('🍅', 5),
      ('🍅', 3),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_values)

  # Apply the built-in `sum` to the values grouped under each key.
  sums = produce | 'Sum values per key' >> beam.CombinePerKey(sum)

  totals_per_key = sums | beam.Map(print)

('🥕', 5)
('🍆', 1)
('🍅', 12)


## **Top:**

* Transforms for finding the largest (or smallest) set of elements in a collection, or the largest (or smallest) set of values associated with each key in a collection of key-value pairs.
* We create a pipeline with a PCollection. Then, we get the largest elements in different ways.




In [21]:
with beam.Pipeline() as pipeline:
  # A small in-memory PCollection of integers.
  numbers = pipeline | 'Create numbers' >> beam.Create([3, 4, 1, 2])

  # Keep the 2 largest values of the whole PCollection.
  top_values = numbers | 'Largest N values' >> beam.combiners.Top.Largest(2)

  largest_elements = top_values | beam.Map(print)



[4, 3]


Use `Top.LargestPerKey()` to get the largest elements for each unique key in a PCollection of key-value pairs.

In [22]:
with beam.Pipeline() as pipeline:
  # (produce, value) pairs; several pairs may share the same key.
  produce_values = [
      ('🥕', 3),
      ('🥕', 2),
      ('🍆', 1),
      ('🍅', 4),
      ('🍅', 5),
      ('🍅', 3),
  ]
  produce = pipeline | 'Create produce' >> beam.Create(produce_values)

  # Keep up to the 2 largest values under each key.
  top_values = produce | 'Largest N values per key' >> (
      beam.combiners.Top.LargestPerKey(2))

  largest_elements_per_key = top_values | beam.Map(print)

('🥕', [3, 2])
('🍆', [1])
('🍅', [5, 4])
