In [None]:
# Install Apache Beam with the %pip magic so the package lands in the
# environment of the running kernel, which a bare `!pip3` does not guarantee.
%pip install apache_beam

In [None]:
import apache_beam as beam

## Map

- Applies a simple 1-to-1 mapping function over each element in the collection.

In [None]:
def strip_header_and_newline(text):
  """Return `text` with leading/trailing '#', space and newline characters removed."""
  unwanted_chars = '# \n'
  cleaned = text.strip(unwanted_chars)
  return cleaned

with beam.Pipeline() as pipeline:
  # Input strings: each one starts with '# ' and ends with a newline.
  heading_lines = [
      '# 🍓Strawberry\n',
      '# 🥕Carrot\n',
      '# 🍆Eggplant\n',
      '# 🍅Tomato\n',
      '# 🥔Potato\n',
  ]
  created = pipeline | 'Gardening plants' >> beam.Create(heading_lines)
  # Map applies the function to every element, one output per input.
  stripped = created | 'Strip header' >> beam.Map(strip_header_and_newline)
  # Print each resulting element.
  plants = stripped | beam.Map(print)

**MapTuple** for key-value pairs. If your PCollection consists of (key, value) pairs, you can use MapTuple to unpack them into different function arguments.

In [None]:
# MapTuple unpacks each (icon, plant) pair into two separate function arguments.
with beam.Pipeline() as pipeline:
  icon_name_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
  ]
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create(icon_name_pairs)
      # Concatenate the two halves of every pair into a single string.
      | 'Format' >> beam.MapTuple(
          lambda emoji, name: '{}{}'.format(emoji, name))
      | beam.Map(print))

## FlatMap

- Applies a simple 1-to-many mapping function over each element in the collection. The elements produced for each input are flattened into the resulting collection.

In [None]:
def split_words(text):
  """Split `text` on commas and return the pieces as a list."""
  separator = ','
  pieces = text.split(separator)
  return pieces

with beam.Pipeline() as pipeline:
  # Two input elements, each holding several comma-separated entries.
  comma_separated_rows = [
      '🍓Strawberry,🥕Carrot,🍆Eggplant',
      '🍅Tomato,🥔Potato',
  ]
  rows = pipeline | 'Gardening plants' >> beam.Create(comma_separated_rows)
  # FlatMap emits every item of the list returned by split_words as its own element.
  words = rows | 'Split words' >> beam.FlatMap(split_words)
  plants = words | beam.Map(print)

**FlatMapTuple for key-value pairs**

If your PCollection consists of (key, value) pairs, you can use FlatMapTuple to unpack them into different function arguments.

In [None]:
def format_plant(icon, plant):
  """Yield `icon` and `plant` joined into one string; yield nothing when `icon` is falsy."""
  if not icon:
    return
  yield '{}{}'.format(icon, plant)

with beam.Pipeline() as pipeline:
  # The last pair has a None icon, for which format_plant yields no element.
  icon_name_pairs = [
      ('🍓', 'Strawberry'),
      ('🥕', 'Carrot'),
      ('🍆', 'Eggplant'),
      ('🍅', 'Tomato'),
      ('🥔', 'Potato'),
      (None, 'Invalid'),
  ]
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create(icon_name_pairs)
      # FlatMapTuple unpacks each pair into format_plant(icon, plant).
      | 'Format' >> beam.FlatMapTuple(format_plant)
      | beam.Map(print))

## Filter

Given a predicate, `Filter` keeps only the elements that satisfy it and drops all the others. It may also be used to filter by comparing each element against a given value, based on the comparison ordering of the elements.

In [None]:
def is_ec2(cdr):
  """Return True when the record `cdr` belongs to the EC2 service.

  '4567-001A' is the value the EC2 records carry under 'service.id'. Their
  'service.description' is 'EC2', so comparing the description against the
  id never matched any record.

  Args:
    cdr: dict-like record that has a 'service.id' key.

  Returns:
    bool: whether the record's service id is '4567-001A'.

  Raises:
    KeyError: if the record has no 'service.id' key.
  """
  return cdr['service.id'] == '4567-001A'

with beam.Pipeline() as pipeline:
  # Sample records, one dict per record.
  cdr_records = [
      {
          'account_id': '01411F',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.122116',
          'usage_start_time': '2024-08-24 14:00:00.000000 UTC',
          'usage_end_time': '2024-08-24 15:00:00.000000 UTC',
          'export_time': '2024-08-25 01:03:47.021696 UTC',
      },
      {
          'account_id': '010FD9',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.002986',
          'usage_start_time': '2024-08-21 00:00:00.000000 UTC',
          'usage_end_time': '2024-08-21 01:00:00.000000 UTC',
          'export_time': '2024-08-21 07:17:49.309164 UTC',
      },
      {
          'account_id': '0111E8',
          'service.id': '4567-001A',
          'service.description': 'EC2',
          'cost': '0.001945',
          'usage_start_time': '2024-08-28 17:00:00.000000 UTC',
          'usage_end_time': '2024-08-28 18:00:00.000000 UTC',
          'export_time': '2024-08-29 03:40:11.062982 UTC',
      },
      {
          'account_id': '0123FE',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.496863',
          'usage_start_time': '2024-08-01 01:00:00.000000 UTC',
          'usage_end_time': '2024-08-01 02:00:00.000000 UTC',
          'export_time': '2024-08-01 12:02:19.080977 UTC',
      },
      {
          'account_id': '01097B',
          'service.id': '4567-001A',
          'service.description': 'EC2',
          'cost': '0.522116',
          'usage_start_time': '2024-08-11 17:00:00.000000 UTC',
          'usage_end_time': '2024-08-11 18:00:00.000000 UTC',
          'export_time': '2024-08-12 02:12:45.602702 UTC',
      },
  ]
  perennials = (
      pipeline
      | 'cloud_cdr' >> beam.Create(cdr_records)
      # Keep only the records for which is_ec2 returns a true value.
      | 'Filter EC2' >> beam.Filter(is_ec2)
      | beam.Map(print))

In [None]:
#Filtering with multiple arguments

def has_service(cdr, service):
  """Return True when the record's 'service.description' equals `service`."""
  description = cdr['service.description']
  return description == service

with beam.Pipeline() as pipeline:
  # Sample records, one dict per record.
  cdr_records = [
      {
          'account_id': '01411F',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.122116',
          'usage_start_time': '2024-08-24 14:00:00.000000 UTC',
          'usage_end_time': '2024-08-24 15:00:00.000000 UTC',
          'export_time': '2024-08-25 01:03:47.021696 UTC',
      },
      {
          'account_id': '010FD9',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.002986',
          'usage_start_time': '2024-08-21 00:00:00.000000 UTC',
          'usage_end_time': '2024-08-21 01:00:00.000000 UTC',
          'export_time': '2024-08-21 07:17:49.309164 UTC',
      },
      {
          'account_id': '0111E8',
          'service.id': '4567-001A',
          'service.description': 'EC2',
          'cost': '0.001945',
          'usage_start_time': '2024-08-28 17:00:00.000000 UTC',
          'usage_end_time': '2024-08-28 18:00:00.000000 UTC',
          'export_time': '2024-08-29 03:40:11.062982 UTC',
      },
      {
          'account_id': '0123FE',
          'service.id': '2062-016F',
          'service.description': 'Support',
          'cost': '0.496863',
          'usage_start_time': '2024-08-01 01:00:00.000000 UTC',
          'usage_end_time': '2024-08-01 02:00:00.000000 UTC',
          'export_time': '2024-08-01 12:02:19.080977 UTC',
      },
      {
          'account_id': '01097B',
          'service.id': '4567-001A',
          'service.description': 'EC2',
          'cost': '0.522116',
          'usage_start_time': '2024-08-11 17:00:00.000000 UTC',
          'usage_end_time': '2024-08-11 18:00:00.000000 UTC',
          'export_time': '2024-08-12 02:12:45.602702 UTC',
      },
  ]
  perennials = (
      pipeline
      | 'cloud_cdr' >> beam.Create(cdr_records)
      # The extra positional argument 'EC2' is handed to has_service as `service`.
      | 'Filter EC2' >> beam.Filter(has_service, 'EC2')
      | beam.Map(print))

In [None]:
# Mount Google Drive at /content/drive so the data folder used by the following
# cells is reachable. Requires the google.colab package (a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Use the absolute path under the /content/drive mount point so this cell also
# works when re-run; the relative form only resolves from /content and fails
# once the working directory has already changed.
%cd /content/drive/My\ Drive/Colab\ Notebooks/Cloud-AI-Analytics/Apache\ Beam\ -Python/data

In [None]:
# List the current directory (the data folder entered by the %cd cell) to check
# that the input file read by the pipeline below is present.
!ls

In [None]:
with beam.Pipeline() as pipeline:
  # Read the export file, keep the rows whose third comma-separated field is
  # "Networking", and write them to files prefixed with result/networking.
  students = (
      pipeline
      # skip_header_lines is a count of lines to drop, so pass 1 rather than True.
      | 'Read from text' >> beam.io.ReadFromText(
          'cloud_export_100.txt', skip_header_lines=1)
      | 'Split the record' >> beam.Map(lambda record: record.split(','))
      # The length guard keeps a blank or truncated line from raising
      # IndexError and aborting the whole pipeline.
      | 'Keep Networking records' >> beam.Filter(
          lambda fields: len(fields) > 2 and fields[2] == 'Networking')
      # NOTE(review): the elements reaching WriteToText are lists of fields,
      # not CSV strings; confirm that is the intended output format.
      | 'Write to text' >> beam.io.WriteToText('result/networking')
  )


In [None]:
# List the files the pipeline's WriteToText step wrote under ./result.
!ls ./result

In [None]:
# Show the first 10 lines of the output shard. The command is a constant, so it
# needs no {...} interpolation.
!head -n 10 result/networking-00000-of-00001