## Apache Beam
> Apache Beam official documentation (built-in I/O connectors): https://beam.apache.org/documentation/io/built-in/



In [None]:
# Apache Beam, Python package
# apache_beam[gcp]
!pip install apache_beam[interactive]

In [None]:
# python package for dicom data processing
!pip install pydicom

### Sample data downloaded from **Kaggle**
##### Reference for setting up a Kaggle access token in Colab: https://www.kaggle.com/general/74235
> Kaggle site:
> 1. DICOM data: https://www.kaggle.com/carlossalazar/dicomfolders

In [None]:
!pip install kaggle

In [None]:
# Upload kaggle.json (the Kaggle API access token) into the Colab session.
from google.colab import files

# files.upload() returns a dict of {filename: file contents}. Leaving it as the
# cell's last expression would echo the token into the cell output, so keep
# only the filenames and show those instead.
uploaded_names = sorted(files.upload())
print("Uploaded:", uploaded_names)

In [None]:
#!rm -r ~/.kaggle

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#!kaggle datasets list 

In [None]:
# Download the carlossalazar/dicomfolders dataset with the Kaggle CLI.
# The next cell expects the archive at /content/dicomfolders.zip.
!kaggle datasets download carlossalazar/dicomfolders

In [None]:
!mkdir /content/sample_data/dicom_input;
!unzip /content/dicomfolders.zip;

In [None]:
!mv /content/Encapsulated1.dcm /content/sample_data/dicom_input/
!mv /content/Encapsulated2.dcm /content/sample_data/dicom_input/
!mv /content/Encapsulated3.dcm /content/sample_data/dicom_input/
!mv /content/Encapsulated5.dcm /content/sample_data/dicom_input/

### DICOM data processing

In [None]:
# Apache Beam core, pipeline options, and the file-based I/O transforms (fileio)
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import fileio

# Pipeline options built with no arguments; passed to the pipelines created below
pipeline_option = PipelineOptions()

> Read a sample CSV file and decode it with `read_utf8()`.
>
> Create the pipeline and keep each stage as its own PCollection.

In [None]:
# Read the sample CSV file as a single UTF-8 string and print it.
# Each stage is kept in its own variable to show how PCollections chain.
# Using the pipeline as a context manager runs it when the block exits and
# waits for it to finish, matching the other pipelines in this notebook
# (a bare pipe.run() only returns a result handle).
with beam.Pipeline(options=pipeline_option) as pipe:
  # FileMetadata for every path matching the pattern
  pcol_file_pattern = (pipe | fileio.MatchFiles(file_pattern="/content/sample_data/california_housing_test.csv"))
  # Turn each match into a readable file handle
  pcol_file_readmatches = (pcol_file_pattern | fileio.ReadMatches())
  # Decode the whole file content as UTF-8 text
  pcol_read_utf8 = (
      pcol_file_readmatches | beam.Map(lambda x: x.read_utf8())
  )
  pcol_print = (pcol_read_utf8 | beam.Map(print))

> Error: a DICOM file is binary and cannot be decoded as UTF-8, so `read_utf8()` fails.
>
``` python3
UnicodeDecodeError: 'utf-8 [while running '[14]: Map(<lambda at <ipython-input-14-b5478eabd99b>:6>)']' codec can't decode byte 0x88 in position 140: invalid start byte
```

In [None]:
# A DICOM file cannot be read as UTF-8 text: read_utf8() fails on its binary content.
# The failure is the point of this cell, so it is caught and displayed; that way
# "Restart & Run All" can continue past the demonstration.
try:
  with beam.Pipeline(options=pipeline_option) as dicom_pipeline:
    pcol01 = (
        dicom_pipeline | fileio.MatchFiles(file_pattern="/content/sample_data/dicom_input/*.dcm")
                       | fileio.ReadMatches()
                       | beam.Map(lambda x: x.read_utf8())
                       | beam.Map(print)
    )
# NOTE(review): RuntimeError is included in case the runner wraps the worker
# exception instead of re-raising UnicodeDecodeError - confirm for your runner.
except (UnicodeDecodeError, RuntimeError) as decode_error:
  print("{}: {}".format(type(decode_error).__name__, decode_error))

In [None]:
# DICOM data dictionary: maps an integer tag to a tuple describing that tag.
from itertools import islice

from pydicom.datadict import DicomDictionary

# The dictionary has far too many entries to print in full; show the total
# count and only the first 20 entries as a preview.
print("total entries: {}".format(len(DicomDictionary)))
for key, value in islice(DicomDictionary.items(), 20):
  print("key: {} value: {}".format(key, value))

> Read DICOM metadata and print it or write it to files
>
> **ParDo, DoFn** and the **Map function**

In [None]:
import json
import pydicom
from pydicom.datadict import DicomDictionary

class ReadDicomToJson(beam.DoFn):
  """DoFn that loads a matched DICOM file and emits it as a JSON-style dict."""

  def process(self, element):
    """Read the file at ``element.metadata.path`` with pydicom.

    Args:
      element: an object exposing ``metadata.path``; in this notebook it is
        an item produced by ``fileio.ReadMatches()``.

    Yields:
      The result of ``Dataset.to_json_dict()`` for the file.
    """
    dataset = pydicom.dcmread(element.metadata.path)
    yield dataset.to_json_dict()

class DicomJson_Transformation(beam.DoFn):
  """DoFn that flattens a DICOM JSON dict into ``{column name: string value}``."""

  def process(self, element):
    """Rename each tag to its dictionary keyword and stringify its value.

    Args:
      element: dict keyed by hexadecimal tag strings; each value is a dict
        that may carry a ``"vr"`` and a ``"Value"`` entry.

    Yields:
      ``str()`` of the flattened dict (a Python dict repr, not strict JSON).
    """
    new_dicom_json = {}
    for key, value in element.items():
      # Look the tag up directly by its integer value instead of scanning the
      # whole DicomDictionary for every tag.
      entry = DicomDictionary.get(int(key, 16))
      # Element 4 of a dictionary entry is used as the column name
      # (the tag keyword in pydicom's dictionary layout).
      keyword = str(entry[4]) if entry is not None else ""
      # Unknown tags, and known tags with an empty keyword, fall back to
      # "_<tag>_<vr>" so that they cannot overwrite each other under "".
      column = keyword or "_" + str(key) + "_" + value.get("vr", "")

      data = value.get("Value", "")
      if isinstance(data, list) and len(data) > 1:
        # Several values: keep the whole list, rendered as a string
        row = str(data)
      elif isinstance(data, list) and len(data) == 1:
        # Single value: unwrap it from the list
        row = str(data[0])
      else:
        # No "Value" entry, an empty list, or a non-list value
        row = ""
      new_dicom_json[column] = row
    yield str(new_dicom_json)

def element_print(element):
  """Print ``element`` to stdout; intended as a debug step, e.g. ``beam.Map(element_print)``."""
  print(element)


# Match one DICOM file, read it into a JSON-style dict, flatten that dict into
# a string and hand the string to WriteToFiles with the dicom_out path.
# The pipeline is used as a context manager; Beam runs it when the block exits.
with beam.Pipeline(options=pipeline_option) as dicom_pipeline:
  pcol01 = (
      dicom_pipeline | fileio.MatchFiles(file_pattern="/content/sample_data/dicom_input/Encapsulated5.dcm")
                     | fileio.ReadMatches()
                     | beam.ParDo(ReadDicomToJson())
                     | beam.ParDo(DicomJson_Transformation())
                     # Debug alternative: print each record instead of writing it
                     #| beam.Map(element_print)
                     | fileio.WriteToFiles("/content/sample_data/dicom_out")
  )

> Basic transformations: **Map, FlatMap, Filter**
>
> Create **in-memory data** in Beam

In [None]:
# Three sample employee records used as in-memory input for the pipelines below.
emp_data_01 = {
    "name": "Indranil",
    "company": "HCL",
    "experience": 7,
    "tech": ["python", "composer", "dataflow"],
}
emp_data_02 = {
    "name": "Mayank",
    "company": "HCL",
    "experience": 17,
    "tech": ["python", "composer", "dataflow", "go"],
}
emp_data_03 = {
    "name": "Raj",
    "company": "Google",
    "experience": 15,
    "tech": ["python", "composer", "dataflow", "go"],
}

# Collect the records in order into the list handed to beam.Create
memory_data = [emp_data_01, emp_data_02, emp_data_03]

print(memory_data)


In [None]:
# NOTE(review): Iterable is imported but not used in this cell.
from apache_beam.typehints.typehints import Iterable

# Map emits the return value of the function for every element, so this
# pipeline prints one boolean per record rather than the matching records.
# The context manager runs the pipeline when the block exits and waits for it
# to finish (a bare pipe1.run() only returns a result handle).
with beam.Pipeline() as pipe1:
  pcol1 = (pipe1 | beam.Create(memory_data))
  pcol2 = (pcol1 | beam.Map(lambda x: x["company"] == "HCL"))
  pcol3 = (pcol2 | beam.Map(print))

In [None]:
# NOTE(review): Iterable is imported but not used in this cell.
from apache_beam.typehints.typehints import Iterable

# Filter keeps only the elements for which the predicate is true, so this
# pipeline prints the records whose company is "HCL".
# The context manager runs the pipeline when the block exits and waits for it
# to finish (a bare pipe1.run() only returns a result handle).
with beam.Pipeline() as pipe1:
  pcol1 = (pipe1 | beam.Create(memory_data))
  pcol2 = (pcol1 | beam.Filter(lambda x: x["company"] == "HCL"))
  pcol3 = (pcol2 | beam.Map(print))