<a href="https://colab.research.google.com/github/svetakvsundhar/beam/blob/colab_testing_example/examples/notebooks/blog/unittests_in_beam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

In [4]:
# Install the Apache Beam library
!pip install apache_beam[gcp] --quiet

**Example 1**

In [None]:
#The following packages are used to run the example pipelines

import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.options.pipeline_options import PipelineOptions

class MyDoFn(beam.DoFn):
  """Placeholder DoFn sketching an API lookup followed by a length check.

  The intended logic is kept as commented-out pseudocode below, so as written
  `process` does nothing with the element and returns None.
  """

  def process(self, element):
    """Handles one input element; currently a stub that returns None."""
    # Sketch of the intended logic (left disabled):
    #   returned_record = MyApiCall.get_data("http://my-api-call.com")
    #   if len(returned_record)!=10:
    #     raise ValueError("Length of record does not match expected length")
    #   return y
    return None

# Example 1 pipeline: read the sample file as text, hand every line to MyDoFn,
# and write the DoFn's output under the /content/example1 prefix. Leaving the
# `with` block runs the pipeline.
with beam.Pipeline() as p:
  result = (p
            | ReadFromText("/content/sample_data/anscombe.json")  # source
            | beam.ParDo(MyDoFn())                                # per-element step
            | WriteToText("/content/example1"))                   # sink

**Mocking Example**

In [None]:
!pip install mock  # Install the 'mock' module

In [7]:
# We import the mock package for mocking functionality.
from unittest.mock import Mock,patch
# from MyApiCall import get_data
import mock


# MyApiCall stands for the module whose get_data function performs the API call;
# patching 'MyApiCall.get_data' lets the test control what that call returns.
@patch('MyApiCall.get_data')
def test_error_message_wrong_length(self, mock_get_data):
  """Checks that the pipeline raises ValueError for a record of the wrong length.

  Written as a `unittest.TestCase` method: it relies on
  `self.assertRaisesRegex`.

  Args:
    mock_get_data: Mock injected by `@patch` in place of `MyApiCall.get_data`.
  """
  # Canned payload served by the mocked API call via `.json()`.
  response = ['field1', 'field2']
  mock_get_data.return_value = Mock()
  mock_get_data.return_value.json.return_value = response

  input_elements = ['-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000']  # input length 9
  with self.assertRaisesRegex(
      ValueError, "Length of record does not match expected length"):
    # The pipeline only executes when the inner `with` block exits; merely
    # building it would never invoke MyDoFn, so no error could surface.
    with beam.Pipeline() as p:
      _ = p | beam.Create(input_elements) | beam.ParDo(MyDoFn())


**Example 2**


In [8]:
# The following code computes the median house value per bedroom
def median_house_value_per_bedroom(element):
  """Returns median_house_value divided by total_bedrooms for one CSV line.

  Args:
    element: A comma-separated record; surrounding whitespace is stripped.
      Column 8 holds median_house_value and column 4 holds total_bedrooms.

  Returns:
    The quotient of the two columns as a float.
  """
  fields = element.strip().split(',')
  median_house_value = float(fields[8])
  total_bedrooms = float(fields[4])
  return median_house_value / total_bedrooms


# Example 2 pipeline: read the housing CSV (skipping its header line), compute
# the per-bedroom value for every row, and write the results under the
# /content/example2 prefix. Leaving the `with` block runs the pipeline.
with beam.Pipeline() as p2:
    result = (p2
              | ReadFromText("/content/sample_data/california_housing_test.csv",
                             skip_header_lines=1)
              | beam.Map(median_house_value_per_bedroom)
              | WriteToText("/content/example2"))

**Example 3**

In [9]:
import random
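# The following code computes the median house value per bedroom, assigns each
# record one of the keys 1, 2, 3 in rotation, multiplies the value by 10, and
# sums the results per key.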
# Global call counter; starts at -1 so the first processed record lands on index 0.
counter = -1


def median_house_value_per_bedroom(element):
  """Returns a (key, value) pair for one CSV line.

  The value is median_house_value (column 8) divided by total_bedrooms
  (column 4). The key cycles through 1, 2, 3 according to the module-level
  `counter`, which is incremented on every call.

  NOTE(review): key assignment depends on mutable global state, so it depends
  on how many records were processed before this call -- confirm this is
  acceptable wherever the function is reused.

  Args:
    element: A comma-separated record; surrounding whitespace is stripped.

  Returns:
    A tuple `(key, value)` with key in {1, 2, 3} and value a float.
  """
  global counter
  fields = element.strip().split(',')
  # The counter advances before the arithmetic, so it moves even when the
  # division below raises.
  counter += 1
  rotating_keys = [1, 2, 3]
  per_bedroom = float(fields[8]) / float(fields[4])
  return rotating_keys[counter % 3], per_bedroom

def multiply_by_factor(element, factor=10):
  """Scales the value of a (key, value) pair, leaving the key untouched.

  Args:
    element: A two-item `(key, value)` pair.
    factor: Multiplier applied to the value. Defaults to 10, the constant that
      used to be hard-coded, so existing single-argument callers are unaffected.

  Returns:
    The tuple `(key, value * factor)`.
  """
  key, value = element
  return (key, value * factor)


# Example 3 pipeline: read the housing CSV (skipping its header line), turn
# each row into a keyed per-bedroom value, scale it, sum the values per key,
# and write the totals under the /content/example3 prefix.
with beam.Pipeline() as p3:
    result = (p3
              | ReadFromText("/content/sample_data/california_housing_test.csv",
                             skip_header_lines=1)
              | beam.Map(median_house_value_per_bedroom)  # -> (key, value)
              | beam.Map(multiply_by_factor)              # scale the value
              | beam.CombinePerKey(sum)                   # total per key
              | WriteToText("/content/example3"))


**Refactoring Example 3 into a reusable PTransform**

In [None]:
def transform_data_set(pcoll):
  """Applies the Example 3 processing steps to a PCollection of CSV lines.

  Args:
    pcoll: The input PCollection; each element is passed to
      `median_house_value_per_bedroom`.

  Returns:
    The PCollection produced by summing the scaled values per key.
  """
  keyed_values = pcoll | beam.Map(median_house_value_per_bedroom)
  scaled_values = keyed_values | beam.Map(multiply_by_factor)
  return scaled_values | beam.CombinePerKey(sum)

# Composite transform: wraps transform_data_set in a beam.PTransform subclass so
# the whole chain can be applied with a single `|`.
class MapAndCombineTransform(beam.PTransform):
  """PTransform that delegates its expansion to `transform_data_set`."""

  def expand(self, pcoll):
    """Returns the result of running `transform_data_set` on `pcoll`."""
    combined = transform_data_set(pcoll)
    return combined

# Refactored Example 3 pipeline: the map/scale/combine chain is applied as one
# composite transform. It reads the same CSV (skipping the header line) and
# writes under the same /content/example3 prefix as the pipeline above.
with beam.Pipeline() as p3:
   result = (p3
             | ReadFromText("/content/sample_data/california_housing_test.csv",
                            skip_header_lines=1)
             | MapAndCombineTransform()  # Use the new PTransform class
             | WriteToText("/content/example3"))

**Unit Test for Pipeline 3**

In [12]:
import unittest
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class TestBeam(unittest.TestCase):
  """Unit tests for the Example 3 transforms."""

  def setUp(self):
    """Restarts the key-rotation counter before every test."""
    # median_house_value_per_bedroom picks its key from the module-level
    # `counter`; resetting it keeps the expected keys below independent of
    # earlier cells or previous test runs.
    global counter
    counter = -1

  # This test corresponds to example 3, and is written to confirm the pipeline works as intended.
  def test_transform_data_set(self):
    """Feeds four CSV records through MapAndCombineTransform and checks the per-key sums."""
    expected=[(1, 10570.185786231425), (2, 13.375337533753376), (3, 13.315649867374006)]
    input_elements = [
      '-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000',
      '121.05,99.99,23.30,39.5,55.55,41.01,10,34,74.30,91.91',
      '122.05,100.99,24.30,40.5,56.55,42.01,11,35,75.30,92.91',
      '-120.05,39.37,29.00,4085.00,681.00,1557.00,626.00,6.8085,364700.00'
    ]
    # TestPipeline is the pipeline class intended for tests; the assertion is
    # registered inside the block and evaluated when the pipeline runs on exit.
    with TestPipeline() as p3:
      result = (
          p3
          | beam.Create(input_elements)
          # A PTransform is applied directly with `|`; wrapping it in beam.Map
          # would treat it as a per-element function, which it is not.
          | MapAndCombineTransform()
      )
      assert_that(result, equal_to(expected))