In [None]:
!pip3 install apache_beam

In [None]:
import apache_beam as beam

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/Cloud-AI-Analytics/Apache\ Beam\ -Python/data

In [None]:
# List the contents of the current working directory (the data folder).
!ls

In [None]:
!cat cloud_export.txt

## **ParDo:**

- ParDo is a Beam transform for generic parallel processing.

- The ParDo processing paradigm is similar to the “Map” phase of a Map/Shuffle/Reduce-style algorithm: a ParDo transform considers each element in the input PCollection, performs some processing function (your user code) on that element, and emits zero, one, or multiple elements to an output PCollection.


In [None]:
class SplitRow(beam.DoFn):
  """DoFn that breaks one comma-delimited string into its fields."""

  def process(self, element):
    """Return a one-item list holding the fields of `element` split on ','."""
    fields = element.split(',')
    # Wrapping in a list makes the whole row a single output element rather
    # than one output per field.
    return [fields]


class ComputeWordLengthFn(beam.DoFn):
  """DoFn that reports the length of each input element."""

  def process(self, element):
    """Return a one-item list containing `len(element)`."""
    size = len(element)
    return [size]

In [None]:
# Read cloud_export.txt, keep the rows whose third field is "Support", print
# the number of fields in each kept row, and write the kept rows to
# result/Support_service-*.
with beam.Pipeline() as pipeline:
  input_data = (pipeline
                | "read from text" >> beam.io.ReadFromText(
                    "cloud_export.txt",
                    # Number of header lines to skip; an int count, not a flag.
                    skip_header_lines=1)
                | "spliting the record" >> beam.ParDo(SplitRow()))

  count_data = (input_data
                | "filter Support records" >> beam.Filter(
                    # Length guard: a blank or short line has no third field,
                    # and indexing it would raise IndexError.
                    lambda record: len(record) > 2 and record[2] == "Support"))

  # ComputeWordLengthFn emits len(record), i.e. the field count of each row.
  word_lengths = (count_data
                  | "count of records" >> beam.ParDo(ComputeWordLengthFn())
                  | beam.Map(print))

  output_data = (count_data
                 | "Write to Text" >> beam.io.WriteToText("result/Support_service"))

In [None]:
# Read cloud_export.txt, keep the rows whose third field is "EC2", print the
# number of fields in each kept row, and write the kept rows to
# result/EC2_service-*.
with beam.Pipeline() as pipeline:
  input_data = (pipeline
                | "read from text" >> beam.io.ReadFromText(
                    "cloud_export.txt",
                    # Number of header lines to skip; an int count, not a flag.
                    skip_header_lines=1)
                | "spliting the record" >> beam.ParDo(SplitRow()))

  count_data = (input_data
                | "filter EC2 records" >> beam.Filter(
                    # Length guard: a blank or short line has no third field,
                    # and indexing it would raise IndexError.
                    lambda record: len(record) > 2 and record[2] == "EC2"))

  # ComputeWordLengthFn emits len(record), i.e. the field count of each row.
  word_lengths = (count_data
                  | "count of records" >> beam.ParDo(ComputeWordLengthFn())
                  | beam.Map(print))

  output_data = (count_data
                 | "Write to Text" >> beam.io.WriteToText("result/EC2_service"))


In [None]:
# List the output files in the result/ directory.
!ls result/

In [None]:
!{('head -n 10 result/Support_service-00000-of-00001')}

In [None]:
!{('head -n 10 result/EC2_service-00000-of-00001')}

## **Keys:**

Takes a collection of key-value pairs and returns the key of each element.


In [None]:
with beam.Pipeline() as pipeline:
  # Source PCollection of (icon, name) key-value pairs.
  garden_plants = pipeline | 'Garden plants' >> beam.Create([
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ])

  # Keep only the key of every pair, then print each key.
  icons = garden_plants | 'Keys' >> beam.Keys() | beam.Map(print)

## **Values:**

Takes a collection of key-value pairs, and returns the value of each element.



In [None]:
with beam.Pipeline() as pipeline:
  # Source PCollection of (icon, name) key-value pairs.
  garden_plants = pipeline | 'Garden plants' >> beam.Create([
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ])

  # Keep only the value of every pair, then print each value.
  plants = garden_plants | 'Values' >> beam.Values() | beam.Map(print)

## **ToString**

Transforms every element in an input collection to a string. Any non-string element can be converted to a string using standard Python functions and methods. Many I/O transforms, such as textio.WriteToText, expect their input elements to be strings.



1. Key-value pairs to string — `beam.ToString.Kvs()`
2. Elements to string — `beam.ToString.Element()`
3. Iterables to string — `beam.ToString.Iterables()`

In [None]:
with beam.Pipeline() as pipeline:
  # Source PCollection of (icon, name) key-value pairs.
  garden_plants = pipeline | 'Garden plants' >> beam.Create([
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ])

  # Kvs() is the key-value variant used here; swap in ToString.Element() or
  # ToString.Iterables() to try the other two variants.
  plants = garden_plants | 'To string' >> beam.ToString.Kvs() | beam.Map(print)

## **KvSwap:**

- Takes a collection of key-value pairs and returns a collection of key-value pairs in which each key and value are swapped.


In [None]:
with beam.Pipeline() as pipeline:
  # Source PCollection of (icon, name) key-value pairs.
  garden_plants = pipeline | 'Garden plants' >> beam.Create([
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ])

  # Swap key and value in every pair, then print each swapped pair.
  plants = garden_plants | 'Key-Value swap' >> beam.KvSwap() | beam.Map(print)