### Pipeline - Using Map
#### Step by step


In [5]:
import apache_beam as beam

p1 = beam.Pipeline()

# Read / Write pipeline:
attendance_count = (
    p1
    |beam.io.ReadFromText('data/dept_data.txt')
    
    |beam.io.WriteToText('output/output_map1')
)

p1.run()

# visualize output
!{('head -n 10 output/output_map1-00000-of-00001')}



149633CM,Marco,10,Accounts,1-01-2019
212539MU,Rebekah,10,Accounts,1-01-2019
231555ZZ,Itoe,10,Accounts,1-01-2019
503996WI,Edouard,10,Accounts,1-01-2019
704275DC,Kyle,10,Accounts,1-01-2019
957149WC,Kyle,10,Accounts,1-01-2019
241316NX,Kumiko,10,Accounts,1-01-2019
796656IE,Gaston,10,Accounts,1-01-2019
331593PS,Beryl,20,HR,1-01-2019
560447WH,Olga,20,HR,1-01-2019


In [7]:
import apache_beam as beam

p1 = beam.Pipeline()

# Split the data:
attendance_count = (
    p1
    |beam.io.ReadFromText('data/dept_data.txt')
    |beam.Map(lambda record: record.split(','))
    |beam.io.WriteToText('output/output_split')
  
)

p1.run()

# visualize output
!{('head -n 10 output/output_split-00000-of-00001')}



['149633CM', 'Marco', '10', 'Accounts', '1-01-2019']
['212539MU', 'Rebekah', '10', 'Accounts', '1-01-2019']
['231555ZZ', 'Itoe', '10', 'Accounts', '1-01-2019']
['503996WI', 'Edouard', '10', 'Accounts', '1-01-2019']
['704275DC', 'Kyle', '10', 'Accounts', '1-01-2019']
['957149WC', 'Kyle', '10', 'Accounts', '1-01-2019']
['241316NX', 'Kumiko', '10', 'Accounts', '1-01-2019']
['796656IE', 'Gaston', '10', 'Accounts', '1-01-2019']
['331593PS', 'Beryl', '20', 'HR', '1-01-2019']
['560447WH', 'Olga', '20', 'HR', '1-01-2019']


In [8]:
import apache_beam as beam

p1 = beam.Pipeline()

# Split + filter the data:
attendance_count = (
    p1
    |beam.io.ReadFromText('data/dept_data.txt')
    |beam.Map(lambda record: record.split(','))
    |beam.Filter(lambda record: record[3] == 'Accounts')
    |beam.io.WriteToText('output/output_filter')
  
)

p1.run()

# visualize output
!{('head -n 20 output/output_filter-00000-of-00001')}

['149633CM', 'Marco', '10', 'Accounts', '1-01-2019']
['212539MU', 'Rebekah', '10', 'Accounts', '1-01-2019']
['231555ZZ', 'Itoe', '10', 'Accounts', '1-01-2019']
['503996WI', 'Edouard', '10', 'Accounts', '1-01-2019']
['704275DC', 'Kyle', '10', 'Accounts', '1-01-2019']
['957149WC', 'Kyle', '10', 'Accounts', '1-01-2019']
['241316NX', 'Kumiko', '10', 'Accounts', '1-01-2019']
['796656IE', 'Gaston', '10', 'Accounts', '1-01-2019']
['149633CM', 'Marco', '10', 'Accounts', '2-01-2019']
['212539MU', 'Rebekah', '10', 'Accounts', '2-01-2019']
['231555ZZ', 'Itoe', '10', 'Accounts', '2-01-2019']
['503996WI', 'Edouard', '10', 'Accounts', '2-01-2019']
['704275DC', 'Kyle', '10', 'Accounts', '2-01-2019']
['957149WC', 'Kyle', '10', 'Accounts', '2-01-2019']
['241316NX', 'Kumiko', '10', 'Accounts', '2-01-2019']
['796656IE', 'Gaston', '10', 'Accounts', '2-01-2019']
['718737IX', 'Ayumi', '10', 'Accounts', '2-01-2019']
['149633CM', 'Marco', '10', 'Accounts', '3-01-2019']
['212539MU', 'Rebekah', '10', 'Accounts', '3-01-2019']

In [11]:
import apache_beam as beam

p1 = beam.Pipeline()

# Split + filter + key-value:
attendance_count = (
    p1
    |beam.io.ReadFromText('data/dept_data.txt')
    |beam.Map(lambda record: record.split(','))
    |beam.Filter(lambda record: record[3] == 'Accounts')
    |beam.Map(lambda record: (record[1], 1))
    |beam.io.WriteToText('output/output_kv')
  
)

p1.run()

# visualize output
!{('head -n 10 output/output_kv-00000-of-00001')}

('Marco', 1)
('Rebekah', 1)
('Itoe', 1)
('Edouard', 1)
('Kyle', 1)
('Kyle', 1)
('Kumiko', 1)
('Gaston', 1)
('Marco', 1)
('Rebekah', 1)


In [22]:
import apache_beam as beam

with beam.Pipeline() as pipe:
    # Split + filter + key-value + Sum by key:
    attendance_count = (
        pipe
        |"Read file" >> beam.io.ReadFromText('data/dept_data.txt')
        |"Split"     >> beam.Map(lambda record: record.split(','))
        |"Filter"    >> beam.Filter(lambda record: record[3] == 'Accounts')
        |"KV"        >> beam.Map(lambda record: (record[1], 1))     # |beam.Map(lambda record: (record[1], 1))  # By ID
        |"Sum"       >> beam.CombinePerKey(sum)   # GropuBy + Combiner + Reducer
        |"Output"    >> beam.io.WriteToText('output/output_sum')
    )

# visualize output
!{('head -n 10 output/output_sum-00000-of-00001')}



('Marco', 31)
('Rebekah', 31)
('Itoe', 31)
('Edouard', 31)
('Kyle', 62)
('Kumiko', 31)
('Gaston', 31)
('Ayumi', 30)
