<a href="https://colab.research.google.com/github/buaindra/gcp_utility/blob/main/gcp/data_pipeline_poc/gcp_poc_4_composer_mssql_taskflow_dynamic_tasks_geobeam/interactive_beam_dag/Interactive_Beam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Ref
1. Beam Doc: https://medium.com/@zahir.alward/part1-learning-apache-beam-programming-in-python-8f76ca80d1d0

2. Beam: https://beam.apache.org/documentation/io/developing-io-overview/

3. Beam SQL on Notebook: https://beam.apache.org/blog/beam-sql-with-notebooks/

4. Beam Python SDK: https://beam.apache.org/releases/pydoc/2.43.0/

5. Beam Programming guide: https://beam.apache.org/documentation/programming-guide/

6. Codelab for pandas DataFrames: https://codelabs.developers.google.com/codelabs/dataflow-notebooks-streamingwordcount#3

7. (Aggregate) CombineGlobally:
https://beam.apache.org/documentation/transforms/python/aggregation/combineglobally/

In [None]:
!pip install --user apache-beam[interactive] apache-beam[GCP]

In [None]:
import apache_beam as beam

# Suffix every element of the PCollection with its own length.
with beam.Pipeline() as p:
    pcol = (
        p
        | beam.Create(["Indra", "Shivam"])
        | beam.Map(lambda name: f"{name}_{len(name)}")
    )

    # Print each transformed element as the pipeline runs.
    pcol1 = pcol | beam.Map(print)

# Show the PCollection object itself and its Python type.
print(f"pcol: {pcol}")
print(f"type of pcol: {type(pcol)}")

In [None]:
"""
The notebook service account needs the following roles:
  1. Dataflow Worker
  2. BigQuery Data Viewer (dataset level)
  3. BigQuery User (project level)
"""

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions 
from apache_beam.options.pipeline_options import SetupOptions, GoogleCloudOptions 
from apache_beam.runners.interactive import interactive_runner 
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.io.gcp.bigquery import ReadFromBigQuery 
import pandas as pd
import json

bq_project_id = "sample-sbx-toc" 
bq_dataset = "examples"
bq_table = "FLD_HAZ_AR"

## Beam PTransform Function
class CountPer_Column(beam.PTransform):
    """Composite transform that counts rows per value of a single column.

    Every input element must support ``.get`` (e.g. a dict row). Rows that
    do not contain the column are grouped under the string ``"None"``; rows
    whose column value is ``None`` keep ``None`` as their key.
    """

    def __init__(self, column):
        """Remember which column to group by.

        Args:
            column: key looked up in each row to build the grouping key.
        """
        # Run the base initializer so the transform is set up like any
        # other PTransform before we attach our own state.
        super().__init__()
        self.column = column

    def expand(self, pcol):
        """Key each row by the configured column, then count rows per key.

        Args:
            pcol: PCollection of dict-like rows.

        Returns:
            The output of ``beam.combiners.Count.PerKey()`` applied to the
            ``(column_value, row)`` pairs.
        """
        # Bind to a local so the lambda captures only the column name
        # rather than the whole transform instance.
        column = self.column
        keyed_rows = (
            pcol
            | f"Add {column} as Key" >> beam.Map(lambda row: (row.get(column, "None"), row))
        )
        return keyed_rows | f"Count per {column}" >> beam.combiners.Count.PerKey()

## Beam ParDo Function (DoFn)
class Converted_Dict_To_Dataframe(beam.DoFn):
    """DoFn that converts each incoming row into a pandas DataFrame."""

    def process(self, element):
        """Serialize ``element`` to JSON and parse it back as a DataFrame.

        Args:
            element: one row; it must be serializable by ``json.dumps``.

        Yields:
            The DataFrame returned by ``pd.read_json(..., orient="index")``.
        """
        # Function-scope import keeps this block self-contained.
        from io import StringIO

        # print(f"type of element: {type(element)}")
        ## Other ways to build a DataFrame from JSON: json_normalize(),
        ## read_json() and DataFrame.from_dict().
        row_json_str = json.dumps(element)
        # Wrap the text in StringIO: passing a literal JSON string straight
        # to read_json is deprecated in recent pandas releases.
        row_df = pd.read_json(StringIO(row_json_str), orient="index")
        yield row_df

## python function for beam.Map 
def _convertValues(element):
    if str(element[1]) == str(-9999.0):
        return (element[0], "0.0")
    else:
        return (element[0], element[1])

## python function for beam.CombinePerKey
def _max(value_list):
    float_values = [float(value) for value in value_list]
    return max(float_values)

## Pipeline options are not required for Interactive Beam; kept for reference.
pipeline_options = PipelineOptions()
pipeline_options.view_as(SetupOptions).save_main_session = True

## beam pipeline
# pipeline = beam.Pipeline(interactive_runner.InteractiveRunner(), options=pipeline_options)
pipeline = beam.Pipeline(interactive_runner.InteractiveRunner())
pcol_bq_data = (pipeline
                | "Read From Bigquery" >> ReadFromBigQuery(
                      method=ReadFromBigQuery.Method.EXPORT,
                      # table=f"{bq_project_id}:{bq_dataset}.{bq_table}",
                      # Pass the variables themselves; the quoted "{...}"
                      # placeholders were never interpolated.
                      project=bq_project_id,
                      dataset=bq_dataset,
                      # f-string so the table reference is actually filled in.
                      query=f"""SELECT * EXCEPT(geom) 
                      FROM `{bq_project_id}.{bq_dataset}.{bq_table}`; 
                      """,
                      coder=beam.io.gcp.bigquery_read_internal._JsonToDictCoder,
                      use_standard_sql=True,
                      query_priority=beam.io.gcp.bigquery.BigQueryQueryPriority.BATCH,
                      output_type="PYTHON_DICT",
                      # NOTE(review): the original bucket name contained a
                      # space, which is not a valid bucket name; an underscore
                      # is assumed here - confirm against the real bucket.
                      gcs_location="gs://tmp_geobeam_bucket_1/bq_gcs/")
                )

## Combine Globally
pcol_total_row_count = (pcol_bq_data | "Total Row Count" >> beam.combiners.Count.Globally())

## Use beam.combiners.Count.PerKey() through the custom PTransform
pcol_count_per_SOURCE_CIT = (pcol_bq_data | CountPer_Column(column="SOURCE_CIT"))

## without PTransform: (ZONE_SUBTY, DEPTH) pairs with the placeholder depth replaced
pcol_count_per_ZONE_SUBTY = (pcol_bq_data
                             | beam.Map(lambda x: (x.get("ZONE_SUBTY", "None"), x.get("DEPTH", 0)))
                             | beam.Map(_convertValues)
                            )

## Use beam.CombinePerKey to keep the largest DEPTH per ZONE_SUBTY
pcol_maxcount_per_ZONE_SUBTY = (pcol_count_per_ZONE_SUBTY | beam.CombinePerKey(_max))

## converted to Pandas Dataframe
pcol_df = (pcol_bq_data | "Transform Each Row" >> beam.ParDo(Converted_Dict_To_Dataframe()))


result = pipeline.run()
# Use one consistent name so the print below does not raise NameError.
job_status = result.wait_until_finish()

print(f"result: {result}") # return interactive_runner.PipelineResult object
print(f"job_status: {job_status}")  # return None

In [None]:
import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib

# Display each PCollection built in the pipeline cell.
ib.show(pcol_total_row_count)
ib.show(pcol_count_per_SOURCE_CIT)
# Drop pairs whose key is None before displaying; identity comparison is
# the idiomatic None check.
ib.show(pcol_count_per_ZONE_SUBTY | beam.Filter(lambda kv: kv[0] is not None))
ib.show(pcol_maxcount_per_ZONE_SUBTY)