# Interactive Beam Demo

## Requirements

1. Install [graphviz](https://graphviz.gitlab.io/) via `apt install graphviz` or `conda install -c anaconda graphviz`.
2. Install the [graphviz Python library](https://github.com/xflr6/graphviz) via `pip install graphviz`.

In [1]:
import apache_beam as beam
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.direct import direct_runner

# Override a module-level setting on the interactive runner before any
# pipeline is built.
# NOTE(review): presumably limits how many elements of each PCollection the
# runner samples for display -- confirm against the interactive_runner module.
interactive_runner.SAMPLE_SIZE = 8

In [2]:
import time
@beam.ptransform_fn
def Sleep(pcoll, secs):
    """Re-emit the elements of ``pcoll`` after pausing for ``secs`` seconds.

    Every element is paired with the single key ``None`` and grouped, so the
    whole input arrives as one ``(None, values)`` pair. The pause happens
    while that pair is processed, and the grouped values are then flattened
    back into individual elements.

    Args:
        pcoll: The input PCollection.
        secs: Number of seconds handed to ``time.sleep``.

    Returns:
        A PCollection of the grouped input elements, flattened back out.
    """
    return (pcoll
            | beam.Map(lambda x: (None, x))
            | beam.GroupByKey()
            # time.sleep() returns None, so `or kvs` passes the pair through.
            | beam.Map(lambda kvs: time.sleep(secs) or kvs)
            # Index into the (key, values) pair instead of unpacking it in the
            # lambda signature: tuple parameters are a SyntaxError on Python 3.
            | beam.FlatMap(lambda kv: kv[1]))

In [3]:
# Interactive runner layered on top of the bundle-based direct runner.
p = beam.Pipeline(
    runner=interactive_runner.InteractiveRunner(
        underlying_runner=direct_runner.BundleBasedDirectRunner()))

# Source: the integers 1 through 10, passed through the Sleep transform.
init_pcoll = (
    p
    | beam.Create(list(range(1, 11)))
    | Sleep(2))

# Two branches fan out from the same upstream PCollection.
squares = init_pcoll | 'Square' >> beam.Map(lambda n: n ** 2)
cubes = init_pcoll | 'Cube' >> beam.Map(lambda n: n * n * n)

result = p.run(False)

Running...

Using 0 cached PCollections
Executing 9 of 9 transforms.

Create produced {8, 2, 10, 1, 3, ...}

Sleep(2)/Map(<lambda at <ipython-input-2-c11a5999ca56>:5>) produced {(None, 5), (None, 7), (None, 6), (None, 10), (None, 2), ...}

Sleep(2)/GroupByKey/ReifyWindows produced {(None, (9, MIN_TIMESTAMP, [GlobalWi..., (None, (1, MIN_TIMESTAMP, [GlobalWi..., (None, (3, MIN_TIMESTAMP, [GlobalWi..., (None, (5, MIN_TIMESTAMP, [GlobalWi..., (None, (8, MIN_TIMESTAMP, [GlobalWi..., ...}

Sleep(2)/GroupByKey/GroupByKey produced {(None, [(1, MIN_TIMESTAMP, [GlobalW...}

Sleep(2)/GroupByKey produced {(None, [1, 2, 3, 4, 5, 6, 7, 8, 9, ...}

Sleep(2)/Map(<lambda at <ipython-input-2-c11a5999ca56>:7>) produced {(None, [1, 2, 3, 4, 5, 6, 7, 8, 9, ...}

Sleep(2) produced {6, 1, 10, 4, 2, ...}

Cube produced {64, 1, 216, 27, 729, ...}

Square produced {4, 9, 25, 36, 49, ...}

In [4]:
class AverageFn(beam.CombineFn):
  """CombineFn that computes the arithmetic mean of its inputs.

  The accumulator is a ``(running_total, element_count)`` tuple.
  """

  def create_accumulator(self):
    """Return the empty accumulator: a total of 0.0 and a count of 0."""
    return (0.0, 0)

  def add_input(self, sum_count, input):
    """Fold one element into ``sum_count`` and return the new accumulator."""
    # Local names avoid shadowing the builtin ``sum``.
    total, count = sum_count
    return total + input, count + 1

  def merge_accumulators(self, accumulators):
    """Combine partial ``(total, count)`` accumulators into a single one.

    The partials are folded in one pass, so an empty iterable yields the
    same value as ``create_accumulator`` rather than failing to unpack.
    """
    total, count = 0.0, 0
    for partial_total, partial_count in accumulators:
      total += partial_total
      count += partial_count
    return total, count

  def extract_output(self, sum_count):
    """Return total / count, or NaN when no elements were accumulated."""
    total, count = sum_count
    return total / count if count else float('NaN')

In [5]:
# Extend the existing pipeline `p` with a global average over `squares`.
average_sq = squares | 'Average' >> beam.CombineGlobally(AverageFn())
# Run the extended pipeline again on the same runner.
# NOTE(review): the positional False is presumably Pipeline.run's
# test_runner_api flag -- confirm against the installed Beam version.
result = p.run(False)

Running...

Using 1 cached PCollections
Executing 8 of 17 transforms.

Square produced {4, 9, 25, 36, 49, ...}

Average/DoOnce produced {None}

Average/KeyWithVoid produced {(None, 1), (None, 9), (None, 25), (None, 49), (None, 36), ...}

Average/CombinePerKey/GroupByKey/ReifyWindows produced {(None, (100, MIN_TIMESTAMP, [Global..., (None, (25, MIN_TIMESTAMP, [GlobalW..., (None, (49, MIN_TIMESTAMP, [GlobalW..., (None, (81, MIN_TIMESTAMP, [GlobalW..., (None, (16, MIN_TIMESTAMP, [GlobalW..., ...}

Average/CombinePerKey/GroupByKey/GroupByKey produced {(None, [(1, MIN_TIMESTAMP, [GlobalW...}

Average/CombinePerKey/GroupByKey produced {(None, [1, 4, 9, 16, 25, 36, 49, 64...}

Average/CombinePerKey/Combine/ParDo(CombineValuesDoFn) produced {(None, 38.5)}

Average/UnKey produced {38.5}

Average/InjectDefault produced {38.5}

In [6]:
class SumFn(beam.CombineFn):
  """CombineFn that adds up its inputs, using a single number as accumulator."""

  def create_accumulator(self):
    """Start every accumulator at floating-point zero."""
    initial = 0.0
    return initial

  def add_input(self, sum, input):
    """Return the running total ``sum`` increased by ``input``."""
    updated = sum + input
    return updated

  def merge_accumulators(self, accumulators):
    """Add the partial totals together, starting from integer 0."""
    merged = 0
    for partial in accumulators:
      merged = merged + partial
    return merged

  def extract_output(self, sum):
    """The accumulated total is already the result; return it unchanged."""
    return sum

In [None]:
# Extend the existing pipeline `p` with a global sum over `squares`.
sum_sq = squares | 'Sum' >> beam.CombineGlobally(SumFn())
# Run the extended pipeline again on the same runner.
# NOTE(review): the positional False is presumably Pipeline.run's
# test_runner_api flag -- confirm against the installed Beam version.
result = p.run(False)

In [None]:
# Materialize and print each PCollection of interest from the latest run,
# in the order: squares, their average, their sum.
for pcoll in (squares, average_sq, sum_sq):
    print(list(result.get(pcoll)))