<a href="https://colab.research.google.com/github/buaindra/cloud-learning-with-python/blob/dev01/HealthcareAPI_Dicom_Beam_Composer_Python/DicomBeamComposer_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Configuration file (**dicom_config.properties**)
> location: /home/airflow/gcs/data/dicom_config.properties

``` ini
[QA]
project_id =
sa_key_file = gs://<bucket name>/<prefix>/<key-json file> 
gcs_python_bq_to_dw = gs://<bucket name>/<prefix>/<py file> 
setup_file = /home/airflow/gcs/dags/<foldername>/setup.py
extra_package = /home/airflow/gcs/dags/<foldername>/<packagename-version_name-.tar.gz>
```

## Google Dataflow

### Beam Pipeline Code in Python

#### How to pass the service-account key JSON (GCS path) as a side input to a ParDo function
#### Healthcare API import/export calls using side inputs and ParDo

In [None]:
# gcs to hc
import json
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions, GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions, DebugOptions, DirectOptions
from apache_beam.options.pipeline_options import WorkerOptions
import configparser

# Load the settings for the selected environment from the properties file.
env = "QA"
config = configparser.ConfigParser()
config.read("/home/airflow/gcs/data/dicom_config.properties")

# Healthcare / GCS settings handed to the pipeline as one dictionary; every
# value is read verbatim from the [env] section under the same key name.
input_dict = {
    key: config[env][key]
    for key in (
        "project_id",
        "region",
        "hc_dataset_id",
        "hc_dicom_store_id",
        "dicom_gcs_bucket",
        "dicom_gcs_prefix",
    )
}

def get_list_bucket_prefix(bucket_nm, prefix_nm):
  """Yield one ``gs://`` wildcard URI per sub-prefix listed under ``prefix_nm``.

  Args:
    bucket_nm: name of the GCS bucket to list.
    prefix_nm: prefix to list under, with "/" used as the delimiter.

  Yields:
    Strings of the form ``gs://<bucket_nm>/<sub-prefix>**.dcm``.
  """
  from google.cloud import storage

  client = storage.Client(project=input_dict["project_id"])
  blob_iter = client.get_bucket(bucket_nm).list_blobs(delimiter="/", prefix=prefix_nm)
  # The listing is walked to the end before ``prefixes`` is read; the
  # original author notes this iteration is required.
  for _ in blob_iter:
    pass
  for sub_prefix in blob_iter.prefixes:
    yield f"gs://{bucket_nm}/{sub_prefix}**.dcm"

class ImportToDicomStore(beam.DoFn):
  """DoFn that asks the Healthcare API to import one GCS URI into a DICOM store."""

  def process(self, dc_element, hc_element, creds):
    """Issue a ``dicomStores.import`` request for ``dc_element``.

    Args:
      dc_element: GCS URI placed in ``gcsSource.uri`` of the request body.
      hc_element: mapping with "project_id", "region", "hc_dataset_id" and
        "hc_dicom_store_id", used to build the DICOM store resource name.
      creds: iterable of text lines which, joined with newlines, form the
        service-account key JSON.

    Yields:
      The response returned by executing the import request.
    """
    import json
    import time
    from google.oauth2.service_account import Credentials
    from googleapiclient import discovery

    # Throttle: reduces the number of API calls per minute / user / region.
    time.sleep(5)

    key_info = json.loads("\n".join(creds))
    # For a key file on the local disk, Credentials.from_service_account_file
    # could be used instead.
    sa_credentials = Credentials.from_service_account_info(key_info)
    healthcare = discovery.build(serviceName="healthcare", version="v1", credentials=sa_credentials)

    store_path = (
        f"projects/{hc_element['project_id']}"
        f"/locations/{hc_element['region']}"
        f"/datasets/{hc_element['hc_dataset_id']}"
        f"/dicomStores/{hc_element['hc_dicom_store_id']}"
    )
    import_body = {"gcsSource": {"uri": f"{dc_element}"}}

    dicom_stores = healthcare.projects().locations().datasets().dicomStores()
    import_request = dicom_stores.import_(name=store_path, body=import_body)
    yield import_request.execute()


def run(argv=None):
  """Build and run the GCS -> Healthcare DICOM store import pipeline.

  Args:
    argv: optional argument list. ``--sa_key`` (required) is consumed here;
      every other argument is forwarded to Beam as a pipeline option.
  """
  # setup beam argument
  parser = argparse.ArgumentParser()
  parser.add_argument("--sa_key",
                      dest="sa_key",
                      required=True,
                      help="service account credentials json"
  )
  args, beam_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(beam_args)

  google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  # NOTE(review): this reads the "project" key while the module-level config
  # above reads "project_id" -- confirm both keys exist in the properties file.
  google_cloud_options.project = config[env]["project"]
  google_cloud_options.region = config[env]["region"]
  google_cloud_options.job_name = config[env]["job_name"]
  google_cloud_options.staging_location = config[env]["staging_location"]
  google_cloud_options.temp_location = config[env]["temp_location"]
  google_cloud_options.service_account_email = config[env]["service_account_email"]

  pipeline_options.view_as(DebugOptions).experiments = ["use_unsupported_python_version"]
  pipeline_options.view_as(StandardOptions).runner = "DataflowRunner" # "DirectRunner"
  #pipeline_options.view_as(DirectOptions).direct_num_workers = 3
  pipeline_options.view_as(WorkerOptions).num_workers = int(config[env]["num_workers"])
  pipeline_options.view_as(WorkerOptions).max_num_workers = int(config[env]["max_num_workers"])
  #pipeline_options.view_as(WorkerOptions).autoscaling_algorithm = "THROUGHPUT_BASED"
  pipeline_options.view_as(SetupOptions).save_main_session = True
  #pipeline_options.view_as(SetupOptions).setup_file = "./setup.py"
  #pipeline_options.view_as(SetupOptions).extra_packages = ["dist/<packageName-0.1.0.tar.gz>"]

  # debug: show where this pipeline file is being executed from
  import os
  p = os.path.realpath(__file__)
  print("beam pipeline path:", p)

  # pipeline obj initiated
  with beam.Pipeline(options=pipeline_options) as pipeline:
    # Side input 1: the single Healthcare configuration dictionary.
    pc_hc_conf = (pipeline | "Load HC Config" >> beam.Create([input_dict]))
    # Side input 2: the lines of the service-account key file read from GCS.
    pc_credentials = (pipeline | "Read Credentials from GCS" >> beam.io.textio.ReadFromText(args.sa_key))
    pc_main = (
        pipeline | "Added Dicom Input Lists" >> beam.Create(get_list_bucket_prefix(input_dict["dicom_gcs_bucket"], input_dict["dicom_gcs_prefix"]))
                 # Fixed: the side-input wrapper is AsSingleton (was misspelled
                 # "AsSingletone", which raises AttributeError).
                 | "HC Import API" >> beam.ParDo(ImportToDicomStore(), beam.pvalue.AsSingleton(pc_hc_conf), beam.pvalue.AsList(pc_credentials))
                 | "Print Output" >> beam.Map(print)
    )

if __name__ == "__main__":
  run()

In [None]:
# hc to bq
from apache_beam.io.gcp.bigquery import BigQueryDisposition

class Export_HC_Instance(beam.DoFn):
  """DoFn that asks the Healthcare API to export a DICOM store to BigQuery."""

  def process(self, element, creds):
    """Issue a ``dicomStores.export`` request described by ``element``.

    Args:
      element: mapping with "project_id", "region", "hc_dataset_id",
        "hc_dicom_store_id", "bq_dataset" and "bq_table".
      creds: iterable of text lines which, joined with newlines, form the
        service-account key JSON.

    Yields:
      The response returned by executing the export request.
    """
    import json
    from google.oauth2.service_account import Credentials
    from googleapiclient import discovery

    sa_credentials = Credentials.from_service_account_info(json.loads("\n".join(creds)))
    healthcare = discovery.build(serviceName="healthcare", version="v1", credentials=sa_credentials)

    store_path = (
        f"projects/{element['project_id']}"
        f"/locations/{element['region']}"
        f"/datasets/{element['hc_dataset_id']}"
        f"/dicomStores/{element['hc_dicom_store_id']}"
    )
    table_uri = f"bq://{element['project_id']}.{element['bq_dataset']}.{element['bq_table']}"
    export_body = {
        "bigqueryDestination": {
            "tableUri": table_uri,
            "writeDisposition": BigQueryDisposition.WRITE_APPEND,
        }
    }
    # Alternative destination (GCS instead of BigQuery):
    #   {"gcsDestination": {"uriPrefix": "gs://<bucket_name>/<sub_folder>"}}

    dicom_stores = healthcare.projects().locations().datasets().dicomStores()
    export_request = dicom_stores.export(name=store_path, body=export_body)
    yield export_request.execute()

### setup.py and extra_packages call from beam pipeline

In [None]:
# bq to dw
import json
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions, GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions, DebugOptions, DirectOptions
from apache_beam.options.pipeline_options import WorkerOptions
import configparser

# Load the settings for the selected environment from the properties file.
env = "QA"
config = configparser.ConfigParser()
config.read("/home/airflow/gcs/data/dicom_config.properties")

# Content Warehouse and BigQuery settings, each read from the [env] section
# under a key with the same name as the variable.
project_number, location, bq_table, bq_dataset, bq_project = (
    config[env][key]
    for key in ("project_number", "location", "bq_table", "bq_dataset", "bq_project")
)

class ConvertToJsonText(beam.DoFn):
  """DoFn turning one row mapping into JSON text plus a document name."""

  def process(self, element):
    """Serialise ``element`` and derive a document name from its SOPInstanceUID.

    Args:
      element: mapping that contains a "SOPInstanceUID" string.

    Yields:
      A dict with "dicom_element" (the row as indented JSON text, with
      non-serialisable values passed through ``str``) and "doc_name" (the
      SOPInstanceUID with every "." replaced by "_").
    """
    json_text = json.dumps(dict(element), indent=2, default=str)
    safe_uid = str(element["SOPInstanceUID"].replace(".", "_"))
    yield {"dicom_element": json_text, "doc_name": safe_uid}

class ImportToContentWarehouse(beam.DoFn):
  """DoFn that uploads one JSON document through ``healthcare_lib.doc_lib``."""

  def process(self, element):
    """Upload ``element`` as a document, creating the schema when none is found.

    Args:
      element: mapping with "doc_name" and "dicom_element" entries.

    Yields:
      The uploaded document's name, ``dicom_<doc_name>.txt``.
    """
    from healthcare_lib import doc_lib

    target_name = "dicom_" + element["doc_name"] + ".txt"
    payload = str(element["dicom_element"])
    empty_schema_text = ""

    existing = doc_lib.return_schemes_names(
        project_number=project_number, location=location, display_name="dicom"
    )
    # Reuse the first schema that was returned; upload a new one only when
    # the lookup came back empty.
    document_schema = (
        existing[0]
        if existing
        else doc_lib.upload_document_schema(
            project_number=project_number,
            location=location,
            schema_name=bq_table,
            schema_json_text=empty_schema_text,
            endpoint_override="",
        )
    )

    doc_lib.upload_document(
        project_number=project_number,
        location=location,
        endpoint_override="",
        document_schema=document_schema,
        doc_name=target_name,
        doc_json_text=payload,
    )
    yield target_name


def run(argv=None):
  """Build and run the BigQuery -> Content Warehouse pipeline.

  Args:
    argv: optional argument list. The parser declares no arguments of its
      own, so everything it does not recognise is forwarded to Beam as
      pipeline options.
  """
  import os

  _, beam_args = argparse.ArgumentParser().parse_known_args(argv)
  options = PipelineOptions(beam_args)

  # Google Cloud options: each attribute is filled from the config key that
  # carries the same name.
  gcp = options.view_as(GoogleCloudOptions)
  settings = config[env]
  for attr in (
      "project",
      "region",
      "job_name",
      "staging_location",
      "temp_location",
      "service_account_email",
  ):
    setattr(gcp, attr, settings[attr])

  options.view_as(DebugOptions).experiments = ["use_unsupported_python_version"]
  options.view_as(StandardOptions).runner = "DataflowRunner"  # or "DirectRunner"
  workers = options.view_as(WorkerOptions)
  workers.num_workers = int(settings["num_workers"])
  workers.max_num_workers = int(settings["max_num_workers"])
  options.view_as(SetupOptions).save_main_session = True
  # Optional settings left disabled by the author:
  #   options.view_as(DirectOptions).direct_num_workers = 3
  #   options.view_as(WorkerOptions).autoscaling_algorithm = "THROUGHPUT_BASED"
  #   options.view_as(SetupOptions).setup_file = "./setup.py"
  #   options.view_as(SetupOptions).extra_packages = ["dist/<packageName-0.1.0.tar.gz>"]

  # Debug aid: show where this pipeline file is being executed from.
  print("beam pipeline path:", os.path.realpath(__file__))

  with beam.Pipeline(options=options) as pipeline:
    bq_rows = pipeline | "Read from BQ" >> beam.io.ReadFromBigQuery(
        table=bq_table, dataset=bq_dataset, project=bq_project
    )
    documents = bq_rows | "Convert to JsonText" >> beam.ParDo(ConvertToJsonText())
    uploaded = documents | "Load to DW" >> beam.ParDo(ImportToContentWarehouse())
    uploaded | "Print Output" >> beam.Map(print)

if __name__ == "__main__":
  run()

## Google Composer


### setup.py

``` python3
from setuptools import setup, find_packages

# Package definition for the helper code shipped alongside the pipeline.
setup(name = "healthcare_package",
  version="0.2.0",
  # Include every package that setuptools discovers next to this file.
  packages=find_packages(),
  # Pinned runtime dependencies installed together with the package.
  install_requires=["google-cloud-storage==2.1.0", "google-cloud-documentai==1.2.1"]
)
```

#### How to make a dependent Python library available to the Beam pipeline with setup.py

### Composer DAG Code in Python

In [None]:
import sys
import os
from datetime import datetime, timedelta
import configparser
from airflow import DAG
from airflow import models
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.providers.apache.beam.operators.beam import BeamRunPythonPipelineOperator
import logging

log_name = "composer_job"
logger = logging.getLogger(log_name)
logger.setLevel(logging.DEBUG)

# Read the settings from the per-environment properties file. The environment
# name comes from the ENV environment variable: it is used as-is for the
# section name and lower-cased for the file name.
#env = "QA"
env=os.environ["ENV"]
config = configparser.ConfigParser()
#config.read("/home/airflow/gcs/data/dicom_config.properties")
config.read(f"/home/airflow/gcs/data/dicom_config_{env.lower()}.properties")

project_id = config[env]["project_id"]
region = config[env]["region"]
hc_dataset_id = config[env]["hc_dataset_id"]
# Fixed copy/paste slip: both values were previously read from "hc_dataset_id".
bq_table = config[env]["bq_table"]
bq_dataset = config[env]["bq_dataset"]
sa_key_file = config[env]["sa_key_file"]
gcs_python_bq_to_dw = config[env]["gcs_python_bq_to_dw"] # location of the dataflow python file gs://<bucket name>/<prefix>/<py file>

# The DICOM store id is the configured base name plus a timestamp suffix
# computed when this module is parsed.
cur_date = datetime.today() # datetime.now()
dicom_store_id = config[env]["dicom_store_id"] + "_" + cur_date.strftime("%d-%m-%Y-%H-%M-%S")

setup_file = config[env]["setup_file"]
extra_package = config[env]["extra_package"]

def create_dicom_store(project_id, location, dataset_id, dicom_store_id):
    """Create a DICOM store in a Healthcare dataset and return its id.

    Args:
        project_id: Google Cloud project that owns the dataset.
        location: location (region) of the dataset.
        dataset_id: Healthcare dataset to create the store in.
        dicom_store_id: id to give the new DICOM store.

    Returns:
        The last path segment of the created store's resource name. When this
        function is used as a PythonOperator callable, the returned value is
        what downstream tasks read via XCom ("return_value").
    """
    from googleapiclient import discovery # not require explicit package installation in composer 2 airflow 2
    api_version = "v1"
    service_name = "healthcare"
    client = discovery.build(service_name, api_version)
    dicom_store_parent = "projects/{}/locations/{}/datasets/{}".format(project_id, location, dataset_id)
    request = (
        client.projects()
        .locations()
        .datasets()
        .dicomStores()
        # Fixed: the API parameter is spelled "dicomStoreId"; the previous
        # "dicomStoreID" keyword is rejected by the client.
        .create(parent=dicom_store_parent, body={}, dicomStoreId=dicom_store_id)
    )
    response = request.execute()
    return response["name"].split("/")[-1] # will return the created dicom store name

def create_bq_column(bq_project, bq_dataset, bq_table):
    """Add a nullable ``ingestion_datetime`` column to the table if it is missing.

    Args:
        bq_project: project that owns the table.
        bq_dataset: dataset that contains the table.
        bq_table: table whose schema is checked and, if needed, extended.
    """
    from google.cloud import bigquery

    bq_client = bigquery.Client()
    table = bq_client.get_table(f"{bq_project}.{bq_dataset}.{bq_table}")
    original_schema = table.schema

    # Nothing to do when the column is already present. Unlike the earlier
    # flag loop, this also adds the column when the schema is empty.
    if any(field.name == "ingestion_datetime" for field in original_schema):
        return

    new_schema = list(original_schema)
    new_schema.append(bigquery.SchemaField("ingestion_datetime", "datetime", mode="NULLABLE")) # add new column to schema
    table.schema = new_schema
    bq_client.update_table(table, ["schema"])

def update_bq_column(bq_project, bq_dataset, bq_table):
    """Stamp every row whose ``ingestion_datetime`` is NULL with the current time.

    Args:
        bq_project: project that owns the table.
        bq_dataset: dataset that contains the table.
        bq_table: table to update.
    """
    from google.cloud import bigquery

    client = bigquery.Client()
    stamp = datetime.today()
    sql = f"UPDATE `{bq_project}.{bq_dataset}.{bq_table}` SET ingestion_datetime = '{stamp}' WHERE ingestion_datetime is NULL"
    # result() blocks until the DML statement has finished.
    client.query(sql).result()

# Default arguments passed to the DAG below via its default_args parameter.
default_args ={
    "owner": "Airflow",
    "depends_on_past": False,
    # NOTE(review): datetime.today() is evaluated whenever this module is
    # parsed, so the start date moves with each parse -- confirm a fixed date
    # is not wanted here.
    "start_date": datetime.today(),
    # NOTE(review): with retries set to 0 the retry_delay below presumably
    # never takes effect -- confirm.
    "retries": 0,
    "retry_delay": timedelta(minutes=5)
}

# DAG definition. The task variables below reuse the names of the callables
# defined above; each callable is passed to its operator before the name is
# rebound to the operator instance.
with models.DAG(
    dag_id = "composer_job_name",
    default_args = default_args,
    schedule_interval = None, #timedelta(days=1)
    #schedule_interval = "@daily"
) as dag:

    start = DummyOperator(
        task_id = "Start"
    )

    create_dicom_store = PythonOperator(
        task_id = "create_dicom_store",
        provide_context=True,
        python_callable = create_dicom_store,
        op_kwargs={"project_id": project_id, "location": region, "dataset_id": hc_dataset_id, "dicom_store_id": dicom_store_id},
        dag=dag
    )

    wait_dag = BashOperator(
        task_id = "wait_dag",
        bash_command = "sleep 10s"
    )

    create_bq_column = PythonOperator(
        task_id = "create_bq_column",
        provide_context=True,
        python_callable = create_bq_column,
        op_kwargs={"bq_project": project_id, "bq_dataset": bq_dataset, "bq_table": bq_table},
        dag=dag
    )

    # Same arguments as create_bq_column, with the update callable (this
    # replaces the "<same as above>" placeholder, which was not valid Python).
    update_bq_column = PythonOperator(
        task_id = "update_bq_column",
        provide_context=True,
        python_callable = update_bq_column,
        op_kwargs={"bq_project": project_id, "bq_dataset": bq_dataset, "bq_table": bq_table},
        dag=dag
    )

    beam_pipeline = BeamRunPythonPipelineOperator(
        task_id = "beam_pipeline",
        runner = "DataflowRunner",
        py_file = gcs_python_bq_to_dw,
        py_options = [],
        pipeline_options = {
            "setup_file": setup_file,
            "extra_package": extra_package,
            "sa_key": sa_key_file,
            "dicom_store_id": "{{ ti.xcom_pull(task_ids='create_dicom_store', key='return_value') }}",
            # Jinja template for xcom_pull; return_value is the default key if you return anything from the method
        },
        py_requirements = ["apache-beam[gcp]==2.34.0", "google-cloud-storage==2.1.0", "google-api-python-client==2.39.0", "google-cloud-logging==3.0.0"],
        py_interpreter = "python3",
        dataflow_config = {
            "job_name": "dataflow_job_name",
            "wait_until_finished": True,
        }
    )

    end = DummyOperator(
        task_id = "End"
    )

    # beam_pipeline pulls the XCom value pushed by create_dicom_store, so that
    # task has to run first; previously only start >> beam_pipeline >> end was
    # wired and the remaining tasks had no ordering at all.
    # NOTE(review): the tasks are chained in the order they are defined --
    # confirm this is the intended execution order.
    start >> create_dicom_store >> wait_dag >> create_bq_column >> update_bq_column >> beam_pipeline >> end

### Cloud Logging inside Beam (Dataflow) or Airflow (Composer)


In [None]:
import logging
import google.cloud.logging
from google.cloud.logging_v2.handlers import CloudLoggingHandler

def get_logger(log_name: str) -> logging.Logger:
  """Return a logger that writes to Cloud Logging and to the console.

  The logger level is DEBUG. The console handler only lets WARNING and above
  through; no level is set on the Cloud Logging handler.

  Args:
    log_name: name of the Python logger, also passed to the Cloud Logging
      handler as its ``name``.

  Returns:
    The configured ``logging.Logger``.
  """
  logger = logging.getLogger(log_name)
  logger.setLevel(logging.DEBUG)

  # logging.getLogger returns the same object for the same name, so a second
  # call used to attach a second pair of handlers and every record was then
  # emitted twice. Skip the setup when a Cloud Logging handler is present.
  if any(isinstance(handler, CloudLoggingHandler) for handler in logger.handlers):
    return logger

  gcloud_logging_client = google.cloud.logging.Client()
  gcloud_logging_handler = CloudLoggingHandler(
      gcloud_logging_client, name=log_name
  )
  # create a stream handler to log messages to the console
  stream_handler = logging.StreamHandler()
  stream_handler.setLevel(logging.WARNING)

  logger.addHandler(gcloud_logging_handler)
  logger.addHandler(stream_handler)
  return logger

# Module-level logger for the pipeline. The INFO record below passes the
# logger's DEBUG threshold, but the console handler is limited to WARNING
# and above.
log = get_logger("beam-pipeline")
log.info("started pipeline")


### API call response handling

In [None]:
class checkCompletion(beam.DoFn):
    """DoFn that polls a Healthcare long-running operation until it is done."""

    def process(self, element, creds):
        """Wait for the operation named in ``element`` and emit its final state.

        Args:
            element: mapping whose "name" entry is the operation resource name.
            creds: unused here (application default credentials are used);
                kept so existing callers that pass it keep working.

        Yields:
            The final operation response (a dict) once it reports done.
        """
        from googleapiclient import discovery
        from oauth2client.client import GoogleCredentials
        import time
        import logging

        credentials = GoogleCredentials.get_application_default()
        service = discovery.build("healthcare", "v1beta1", credentials=credentials)
        request = service.projects().locations().datasets().operations().get(name=element["name"])

        while True:
            # execute() already returns the parsed response as a dict; the
            # previous response.json() call failed on it.
            response = request.execute()
            if response.get("done"):
                break
            print("operation is in progress, wait for 30 secs and recheck..")
            time.sleep(30)

        # The error, if any, is reported on the finished operation, not on the
        # input element that was inspected before.
        error = response.get("error")
        if error:
            #sys.exit(-1)
            logging.error(error)

        # Yield rather than return: Beam iterates over whatever process()
        # returns, so returning the dict would emit its keys as elements.
        yield response

# Expected input element of the PCollection: {"name": "<operation name taken from the API response>"}

#### BQ Append (did not work)

In [None]:
from apache_beam.io.gcp.bigquery import BigQueryDisposition
from google.cloud import bigquery
from google.cloud.bigquery.schema import SchemaField


def get_schema(table_id):
  """Fetch the schema of ``table_id`` and pass it through ``create_schema``.

  Args:
    table_id: table identifier handed to ``bigquery.Client.get_table``.

  Returns:
    Whatever ``create_schema`` returns for the table's schema.
  """
  source_fields = bigquery.Client().get_table(table_id).schema
  # Debug aid left disabled by the author:
  #   for i in original_schema:
  #     if not i.is_nullable:
  #       print(i.name)
  #   with open("original_schema.txt", "w", encoding="utf-8") as f:
  #     f.write(str(original_schema))
  return create_schema(source_fields)

def create_schema(source_schema):
  """Return the string form of a relaxed copy of ``source_schema``.

  REPEATED fields stay REPEATED; every other field becomes NULLABLE. Nested
  RECORD fields are relaxed the same way.

  NOTE(review): the result is ``str()`` of a list of SchemaField objects and
  the surrounding section is marked as not working -- confirm that the
  consumer of this value accepts that form.

  Args:
    source_schema: iterable of fields exposing ``name``, ``field_type``,
      ``mode`` and ``fields``.

  Returns:
    ``str`` of the list of relaxed ``SchemaField`` objects.
  """
  return str(_relaxed_fields(source_schema))


def _relaxed_fields(source_schema):
  """Build the list of relaxed SchemaField objects, recursing into RECORDs."""
  relaxed = []
  for field in source_schema:
    print(field.name, field.field_type, field.mode)
    mode = "REPEATED" if field.mode == "REPEATED" else "NULLABLE"
    # Nested fields are passed through the public ``fields`` argument as
    # SchemaField objects; before, a string was written into the private
    # ``_fields`` attribute of the nested record.
    nested = tuple(_relaxed_fields(field.fields)) if field.field_type == "RECORD" else ()
    relaxed.append(SchemaField(field.name, field.field_type, mode=mode, fields=nested))
  return relaxed


 | "Write to BQ" >> beam.io.WriteToBigBigQuery(
     table = <>,
     dataset = <>,
     project = <>,
     schema = get_schema(project.dataset.table),
     create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
     write_disposition=BigQueryDisposition.WRITE_APPEND
)

### Debug
1. For **setup.py and the extra package**, provide the full path of the Airflow location when the Beam pipeline is called from Composer; use a relative path only when the Beam pipeline is called directly, without Composer.
2. The **extra package** option takes its argument as a list, so when specifying multiple packages make sure they are all added inside that list.
3. In **Composer**, upload an empty **__init__.py** into every subfolder under the dags folder.
4. etc.

#### Folder Structure in Composer
>
> dags
>
>> composer_dag.py
>
>> dependent folder 
>
>>> __init__.py
>
>>> setup.py
>
>>> beam_gcs_to_hc.py
>
>>> healthcare_lib
>
>>>> __init__.py
>
>>>> doc_lib.py
>
>>> dist
>
>>>> __init__.py
>
>>>> package-0.1.0.tar.gz

### Ref:
#### **BeamRunPythonPipelineOperator**: 
> documentation: https://airflow.apache.org/docs/apache-airflow-providers-apache-beam/stable/_api/airflow/providers/apache/beam/operators/beam/index.html
> 
> sample code: https://airflow.apache.org/docs/apache-airflow-providers-apache-beam/stable/operators.html