### Side Inputs

In addition to the main input PCollection, you can provide additional inputs to a ParDo transform in the form of side inputs. A side input is an additional input that your DoFn can access each time it processes an element in the input PCollection. When you specify a side input, you create a view of some other data that can be read from within the ParDo transform’s DoFn while processing each element.

Side inputs are useful if your ParDo needs to inject additional data when processing each element in the input PCollection, but the additional data needs to be determined at runtime (and not hard-coded). Such values might be determined by the input data, or depend on a different branch of your pipeline.

In [4]:
import apache_beam as beam

# IDs to be excluded: one per line of the file, trailing whitespace stripped.
side_list = []
with open('data/exclude_ids.txt', 'r') as exclude_file:
    side_list.extend(raw_line.rstrip() for raw_line in exclude_file)

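# Extra arguments given to beam.ParDo after the DoFn are forwarded to the DoFn's
# process method, following its first two parameters, self and element.
# Here the exclusion list is a plain in-memory Python list; data produced by another
# branch of the pipeline would instead be wrapped in a view such as beam.pvalue.AsList(...).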

class FilterUsingLength(beam.DoFn):
    """Keep records whose name length is within bounds and whose ID is not excluded.

    Each element is a comma-separated line; the first field is treated as the
    ID and the second field as the name.
    """

    def process(self, element, side_list, lower_bound, upper_bound=float('inf')):
        """Yield the split record when it passes both filters.

        Args:
            element: One comma-separated input line.
            side_list: Collection of IDs whose records must be dropped.
            lower_bound: Minimum accepted name length (inclusive).
            upper_bound: Maximum accepted name length (inclusive); unbounded
                by default.

        Yields:
            The list of comma-separated fields of ``element``.
        """
        # Split once and reuse the fields rather than re-splitting per field.
        fields = element.split(',')
        # ``record_id`` avoids shadowing the ``id`` builtin.
        record_id, name = fields[0], fields[1]
        if lower_bound <= len(name) <= upper_bound and record_id not in side_list:
            yield fields

with beam.Pipeline() as pipe:
    # Use ParDo to keep names with length between 3 and 10 and drop the
    # excluded IDs, then count occurrences of each "ID name" key among the
    # records whose fourth field is 'Accounts'.
    lines = pipe | "Read file" >> beam.io.ReadFromText('data/dept_data.txt')
    kept = lines | "ParDo with side inputs" >> beam.ParDo(FilterUsingLength(), side_list, 3, 10)
    accounts = kept | beam.Filter(lambda record: record[3] == 'Accounts')
    keyed = accounts | beam.Map(lambda record: (record[0] + " " + record[1], 1))
    counts = keyed | beam.CombinePerKey(sum)
    # counts | 'Write results' >> beam.io.WriteToText('output/output_new_final')
    small_names = counts | 'Print results' >> beam.Map(print)

# visualize data
# !{('head -n 20 output/output_new_final-00000-of-00001')}

('503996WI Edouard', 31)
('957149WC Kyle', 31)
('241316NX Kumiko', 31)
('796656IE Gaston', 31)
('718737IX Ayumi', 30)


### Multiple outputs

In [5]:
import apache_beam as beam

# DoFn function 
class ProcessWords(beam.DoFn):
    """Route each record's name to tagged outputs by length and by prefix."""

    def process(self, element, cutoff_length, marker):
        """Emit the name (second comma-separated field) to the matching outputs.

        Every name is emitted under 'Short_Names' or 'Long_Names' depending on
        ``cutoff_length``; names starting with ``marker`` are additionally
        yielded untagged.
        """
        name = element.split(',')[1]

        # Exactly one of the two length tags applies to every name.
        length_tag = 'Short_Names' if len(name) <= cutoff_length else 'Long_Names'
        yield beam.pvalue.TaggedOutput(length_tag, name)

        if name.startswith(marker):
            yield name
        
        
with beam.Pipeline() as pipe:
    # Split the input into three outputs: two tagged by name length and a
    # main output (named 'Names_A') for names starting with the marker.
    results = (
        pipe
        | beam.io.ReadFromText('data/dept_data.txt')
        | beam.ParDo(ProcessWords(), cutoff_length=4, marker='A').with_outputs(
            'Short_Names',
            'Long_Names',
            main='Names_A',
        )
    )

    short_collection = results.Short_Names
    long_collection = results.Long_Names
    startA_collection = results.Names_A

    # The writes must be attached inside the `with` block: the pipeline is run
    # when the block exits, so transforms applied afterwards never execute.

    # write to file
    short_collection | 'Write 1' >> beam.io.WriteToText('output/short')

    # write to file
    long_collection | 'Write 2' >> beam.io.WriteToText('output/long')

    # write to file
    startA_collection | 'Write 3' >> beam.io.WriteToText('output/start_a')

!{'head -n 5 output/short-00000-of-00001'}
!{'head -n 5 output/long-00000-of-00001'}
!{'head -n 5 output/start_a-00000-of-00001'}

Itoe
Kyle
Kyle
Olga
Kirk
Marco
Rebekah
Edouard
Kumiko
Gaston
Ayumi
Ayumi
Ayumi
Ayumi
Ayumi


In [6]:
import apache_beam as beam

# DoFn function 
class ProcessWords(beam.DoFn):
    """Classify each record's name by length and flag names with a given prefix."""

    def process(self, element, cutoff_length, marker):
        """Yield tagged and untagged outputs for the record's second field.

        The name goes to 'Short_Names' when its length is at most
        ``cutoff_length`` and to 'Long_Names' otherwise. If it starts with
        ``marker`` it is also yielded without a tag.
        """
        name = element.split(',')[1]
        is_short = len(name) <= cutoff_length

        yield beam.pvalue.TaggedOutput(
            'Short_Names' if is_short else 'Long_Names', name
        )

        if name.startswith(marker):
            yield name
    
with beam.Pipeline() as pipe:
    # Unpack the outputs directly; the main tag (if specified) comes first,
    # followed by the other tags in the order given to with_outputs.
    startA_collection, short_collection, long_collection = (
        pipe
        | beam.io.ReadFromText('data/dept_data.txt')
        | beam.ParDo(ProcessWords(), cutoff_length=4, marker='A').with_outputs(
            'Short_Names',
            'Long_Names',
            main='Names_A',
        )
    )

    # The writes must be attached inside the `with` block: the pipeline is run
    # when the block exits, so transforms applied afterwards never execute.

    # write to file
    short_collection | 'Write 1' >> beam.io.WriteToText('output/short')

    # write to file
    long_collection | 'Write 2' >> beam.io.WriteToText('output/long')

    # write to file
    startA_collection | 'Write 3' >> beam.io.WriteToText('output/start_a')

!{'head -n 5 output/short-00000-of-00001'}
!{'head -n 5 output/long-00000-of-00001'}
!{'head -n 5 output/start_a-00000-of-00001'}

Itoe
Kyle
Kyle
Olga
Kirk
Marco
Rebekah
Edouard
Kumiko
Gaston
Ayumi
Ayumi
Ayumi
Ayumi
Ayumi
