In [2]:
import apache_beam as beam

In [101]:
#ex1 - End-to-end simple pipeline
def SplitRow(element):
    """Break one comma-separated text line into a list of its fields."""
    fields = element.split(',')
    return fields

# The `with` block runs the pipeline when it exits; this replaces the explicit
# `p1 = beam.Pipeline()` ... `p1.run()` form.
with beam.Pipeline() as p1:

    # Read the file, split each line into fields, keep the rows whose fourth
    # field is 'Accounts', then count the rows per second field.
    count = (
        p1
        | beam.io.ReadFromText('data/dept_data.txt')
        | beam.Map(lambda line: line.split(','))  # alternative: beam.Map(SplitRow)
        | beam.Filter(lambda fields: fields[3] == 'Accounts')
        | beam.Map(lambda fields: (fields[1], 1))
        | beam.CombinePerKey(sum)
        | beam.io.WriteToText('data/output')
    )



In [102]:
!head data/output-00000-of-00001

149633CM
Marco
10
Accounts
1-01-2019
212539MU
Rebekah
10
Accounts
1-01-2019


In [10]:
#ex2 - Branching Pipeline
with beam.Pipeline() as p2:

    # Shared upstream branch: every line of the input file split into fields.
    input_pcollection = (
        p2
        | "Read from text file" >> beam.io.ReadFromText('data/dept_data.txt')
        | "Split rows" >> beam.Map(lambda row: row.split(','))
    )

    # `accounts_count` / `hr_count` are bound to the per-key count PCollections
    # themselves, not to the result of WriteToText, so they can feed further
    # transforms (such as the Flatten example at the bottom of this cell).
    accounts_count = (
        input_pcollection
        | "Get accounts dept persons" >> beam.Filter(lambda row: row[3] == 'Accounts')
        | "Combining with 1" >> beam.Map(lambda record: (record[1], 1))
        | "Group and sum" >> beam.CombinePerKey(sum)
    )
    accounts_count | "Write results for accounts" >> beam.io.WriteToText('data/accounts')

    hr_count = (
        input_pcollection
        | "Get HR dept persons" >> beam.Filter(lambda row: row[3] == 'HR')
        | "Combining with 1 - HR" >> beam.Map(lambda record: (record[1], 1))
        | "Group and sum - HR" >> beam.CombinePerKey(sum)
    )
    hr_count | "write results for HR" >> beam.io.WriteToText('data/HR')

    # Optional: merge both branches into a single output. Left disabled so the
    # cell writes exactly the same files as before; uncomment to enable.
    # union = (
    #     (accounts_count, hr_count)
    #     | beam.Flatten()
    #     | "write results for union" >> beam.io.WriteToText('data/union')
    # )



In [11]:
!head data/HR-00000-of-00001
#!!head data/accounts-00000-of-00001
#!head data/union-00000-of-00001

('Beryl', 62)
('Olga', 31)
('Leslie', 31)
('Mindy', 31)
('Vicky', 31)
('Richard', 31)
('Kirk', 31)
('Kaori', 31)
('Oscar', 31)


In [9]:
#ex3 - ParDo

class SplitRow(beam.DoFn):
    """DoFn that turns one comma-separated line into one list of fields."""

    def process(self, element):
        """Emit a single output for ``element``: the list of its fields."""
        # Yielding the list (rather than its items) keeps one output element
        # per input line.
        yield element.split(',')

with beam.Pipeline() as p3:

    # The split step uses ParDo with an inline callable that returns a
    # one-element list (one output per input line). Alternatives for the same
    # step: beam.ParDo(SplitRow()) or beam.Map(lambda row: row.split(',')).
    count = (
        p3
        | beam.io.ReadFromText('data/dept_data.txt')
        | beam.ParDo(lambda line: [line.split(',')])
        | beam.Filter(lambda fields: fields[3] == 'Accounts')
        | beam.Map(lambda fields: (fields[1], 1))
        | beam.CombinePerKey(sum)
        | beam.io.WriteToText('data/output')
    )



In [10]:
!head data/output-00000-of-00001

('Marco', 31)
('Rebekah', 31)
('Itoe', 31)
('Edouard', 31)
('Kyle', 62)
('Kumiko', 31)
('Gaston', 31)
('Ayumi', 30)


In [12]:
#ex4 - Combiner
import apache_beam as beam

class AverageFunc(beam.CombineFn):
    """CombineFn that computes the arithmetic mean of its inputs.

    The accumulator is a ``(running_sum, element_count)`` tuple.
    """

    def create_accumulator(self):
        """Return the empty accumulator: a float sum of 0.0 and a count of 0."""
        return (0.0, 0)

    def add_input(self, sum_count, input):
        """Fold one input value into ``sum_count`` and return the new accumulator.

        The parameter name ``input`` is kept unchanged for interface
        compatibility, although it shadows the builtin inside this method.
        """
        running_sum, count = sum_count
        return running_sum + input, count + 1

    def merge_accumulators(self, accumulators):
        """Combine an iterable of partial accumulators into a single one.

        The iterable is consumed in one pass, starting from the empty
        accumulator, so an empty iterable yields ``(0.0, 0)`` instead of
        failing while unpacking ``zip(*accumulators)``.
        Example: [(1, 2), (3, 4), (5, 6)] -> (9.0, 12).
        """
        merged_sum, merged_count = self.create_accumulator()
        for partial_sum, partial_count in accumulators:
            merged_sum += partial_sum
            merged_count += partial_count
        return merged_sum, merged_count

    def extract_output(self, sum_count):
        """Return the mean held by ``sum_count``, or NaN when the count is 0."""
        running_sum, count = sum_count
        return running_sum / count if count else float('NaN')

with beam.Pipeline() as p4:

    # Average the integers 1..8 with the custom CombineFn and write the single
    # result to the 'data/combine' output.
    small_sum = (
        p4
        | beam.Create(list(range(1, 9)))
        | beam.CombineGlobally(AverageFunc())
        | 'Write Results' >> beam.io.WriteToText('data/combine')
    )



In [13]:
!head data/combine-00000-of-00001

4.5


In [2]:
#ex5 - Composite Transforms
import apache_beam as beam

class MyTransform(beam.PTransform):
    """Composite transform: pair each record's second field with 1, then sum per key."""

    def expand(self, input_call):
        """Apply both steps to ``input_call`` and return the resulting PCollection."""
        paired = input_call | "Combining with 1" >> beam.Map(lambda fields: (fields[1], 1))
        return paired | "Group and sum" >> beam.CombinePerKey(sum)

with beam.Pipeline() as p5:

    # Shared upstream branch: every line of the input file split into fields.
    input_pcollection = (
        p5
        | "Read from text file" >> beam.io.ReadFromText('data/dept_data.txt')
        | "Split rows" >> beam.Map(lambda row: row.split(','))
    )

    # `accounts_count` / `hr_count` are bound to the outputs of MyTransform,
    # not to the result of WriteToText, so they can feed further transforms
    # (such as the Flatten example at the bottom of this cell).
    accounts_count = (
        input_pcollection
        | "Get accounts dept persons" >> beam.Filter(lambda row: row[3] == 'Accounts')
        | "Composite transform MyTransform" >> MyTransform()
    )
    accounts_count | "Write results for accounts" >> beam.io.WriteToText('data/accounts')

    hr_count = (
        input_pcollection
        | "Get HR dept persons" >> beam.Filter(lambda row: row[3] == 'HR')
        | "Composite transform MyTransform - HR" >> MyTransform()
    )
    hr_count | "write results for HR" >> beam.io.WriteToText('data/HR')

    # Optional: merge both branches into a single output. Left disabled so the
    # cell writes exactly the same files as before; uncomment to enable.
    # union = (
    #     (accounts_count, hr_count)
    #     | beam.Flatten()
    #     | "write results for union" >> beam.io.WriteToText('data/union')
    # )



In [3]:
!head data/accounts-00000-of-00001

('Marco', 31)
('Rebekah', 31)
('Itoe', 31)
('Edouard', 31)
('Kyle', 62)
('Kumiko', 31)
('Gaston', 31)
('Ayumi', 30)


In [13]:
#ex6 - CoGroupByKey
import apache_beam as beam

def _key_and_fields(row):
    """Split a comma-separated line into ``(first_field, remaining_fields)``.

    The line is split only once; the first field becomes the key used by
    CoGroupByKey and the remaining fields are kept as a list.
    """
    key, *fields = row.split(',')
    return key, fields


with beam.Pipeline() as p6:

    # Both inputs are keyed the same way so they can be joined on the key.
    dep_rows = (
        p6
        | "Read dep file - dep" >> beam.io.ReadFromText('data/dept_data.txt')
        | "Split rows - dep" >> beam.Map(_key_and_fields)
    )

    location_rows = (
        p6
        | "Read dep file - location" >> beam.io.ReadFromText('data/location.txt')
        | "Split rows - location" >> beam.Map(_key_and_fields)
    )

    # For each key, CoGroupByKey produces a dict with the values found under
    # "dep_data" and "loc_data".
    results = (
        {"dep_data": dep_rows, "loc_data": location_rows}
        | "Cogroup transform" >> beam.CoGroupByKey()
        | "write results" >> beam.io.WriteToText('data/cogroup')
    )



In [14]:
!head data/cogroup-00000-of-00001

('149633CM', {'dep_data': [['Marco', '10', 'Accounts', '1-01-2019'], ['Marco', '10', 'Accounts', '2-01-2019'], ['Marco', '10', 'Accounts', '3-01-2019'], ['Marco', '10', 'Accounts', '4-01-2019'], ['Marco', '10', 'Accounts', '5-01-2019'], ['Marco', '10', 'Accounts', '6-01-2019'], ['Marco', '10', 'Accounts', '7-01-2019'], ['Marco', '10', 'Accounts', '8-01-2019'], ['Marco', '10', 'Accounts', '9-01-2019'], ['Marco', '10', 'Accounts', '10-01-2019'], ['Marco', '10', 'Accounts', '11-01-2019'], ['Marco', '10', 'Accounts', '12-01-2019'], ['Marco', '10', 'Accounts', '13-01-2019'], ['Marco', '10', 'Accounts', '14-01-2019'], ['Marco', '10', 'Accounts', '15-01-2019'], ['Marco', '10', 'Accounts', '16-01-2019'], ['Marco', '10', 'Accounts', '17-01-2019'], ['Marco', '10', 'Accounts', '18-01-2019'], ['Marco', '10', 'Accounts', '19-01-2019'], ['Marco', '10', 'Accounts', '20-01-2019'], ['Marco', '10', 'Accounts', '21-01-2019'], ['Marco', '10', 'Accounts', '22-01-2019'], ['Marco', '10', 'Accounts', '23-01-2