In [1]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '


In [2]:
# ParDo: apply a DoFn to every element of a PCollection

In [3]:
class ComputeWordLengthFn(beam.DoFn):
    """DoFn that maps each input element to its length."""

    def process(self, x):
        """Return a one-element list holding ``len(x)``.

        Args:
            x: the element being processed; anything that supports ``len()``.

        Returns:
            A list with a single integer, the length of ``x``.
        """
        length = len(x)
        return [length]

# Input words for the ParDo example.
words = ['this', 'is', 'a', 'list']

# Build a pipeline that emits the length of each word and writes the
# lengths as text; the output file name is prefixed with 'output.txt'.
with beam.Pipeline(options=PipelineOptions()) as p:
    lines = p | beam.Create(words)
    word_lengths = lines | beam.ParDo(ComputeWordLengthFn())
    word_lengths | beam.io.WriteToText('output.txt')

In [4]:
!cat output.txt-00000-of-00001

4
2
1
4


In [5]:
# Input words for the Map example.
words = ['this', 'is', 'a', 'list']

# Same idea as the ParDo cell, but using beam.Map with a lambda that
# renders each word as "<word>,<length>".
with beam.Pipeline(options=PipelineOptions()) as p:
    lines = p | beam.Create(words)
    word_lengths = lines | beam.Map(
        lambda word: '%s,%s' % (word, len(word))
    )
    word_lengths | beam.io.WriteToText('output-2.txt')

In [6]:
!cat output-2.txt-00000-of-00001

this,4
is,2
a,1
list,4


In [7]:
# GroupByKey: collect all values that share the same key

In [13]:
# Read "key,value" lines, group the values by key, and write one
# "key,[values]" line per key.
with beam.Pipeline(options=PipelineOptions()) as p:
    lines = p | beam.io.ReadFromText('group-by-key-input.csv')

    # GroupByKey operates on (key, value) pairs, so build an explicit
    # 2-tuple rather than passing along the list that str.split returns.
    # maxsplit=1 keeps any further commas inside the value instead of
    # producing a 3+-element record that is not a valid pair.
    key_values = lines | beam.Map(lambda x: tuple(x.split(',', 1)))

    group_by = key_values | beam.GroupByKey()

    # Materialize the grouped values as a list so the formatted text does
    # not depend on which iterable type the runner hands back.
    format_out_lines = group_by | beam.Map(lambda x: '%s,%s'%(x[0], list(x[1])))

    (format_out_lines | beam.io.WriteToText('output-3.txt'))

In [14]:
!cat output-3.txt-00000-of-00001

cat,['1', '5', '9']
dog,['5', '2']
and,['1', '2', '6']
jump,['3']
tree,['2']


In [15]:
# CoGroupByKey: join two or more keyed PCollections on their shared key

In [16]:
def join_info(name_info):
    """Format one CoGroupByKey result as ``name; [emails]; [phones]``.

    Args:
        name_info: a ``(name, info)`` pair where ``info`` is indexable by
            ``'emails'`` and ``'phones'``, each yielding an iterable.

    Returns:
        A string with the name and the sorted emails and phones.
    """
    name, info = name_info
    emails_sorted = sorted(info['emails'])
    phones_sorted = sorted(info['phones'])
    return '%s; %s; %s' % (name, emails_sorted, phones_sorted)


# Sample (name, email) and (name, phone) records; names may repeat or be
# present in only one of the two lists.
emails_list = [
    ('amy', 'amy@example.com'),
    ('carl', 'carl@example.com'),
    ('julia', 'julia@example.com'),
    ('carl', 'carl@email.com'),
]
phones_list = [
    ('amy', '111-222-3333'),
    ('james', '222-333-4444'),
    ('amy', '333-444-5555'),
    ('carl', '444-555-6666'),
]

with beam.Pipeline(options=PipelineOptions()) as p:
    emails = p | 'CreateEmails' >> beam.Create(emails_list)
    phones = p | 'CreatePhones' >> beam.Create(phones_list)

    # Tag each input so join_info can look the groups up by name.
    tagged_inputs = {'emails': emails, 'phones': phones}
    results = tagged_inputs | beam.CoGroupByKey()

    contact_lines = results | beam.Map(join_info)
    contact_lines | beam.io.WriteToText('output-4.txt')

In [17]:
!cat output-4.txt-00000-of-00001

amy; ['amy@example.com']; ['111-222-3333', '333-444-5555']
james; []; ['222-333-4444']
carl; ['carl@email.com', 'carl@example.com']; ['444-555-6666']
julia; ['julia@example.com']; []
