In [1]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Scale data using `MLTransform`

<table align="left">
  <td>
    <a target="_blank" href="https://colab.sandbox.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/mltransform/scale_data.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/colab_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/mltransform/scale_data.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/github_32px.png" />View source on GitHub</a>
  </td>
</table>


In [None]:
! git clone https://github.com/apache/beam.git
! cd beam/sdks/python
! pip install beam/sdks/python
! pip install tensorflow-transform --quiet

Use this notebook to scale an entire dataset by using scaling data processing transformations, such as the following transforms:

* `ScaleTo01`: Calculates the minimum and maximum of an entire dataset, and then scales the dataset between 0 and 1 based on minimum and maximum values.
* `ScaleToZScore`: Calculates the mean and variance of an entire dataset, and then scales the dataset based on those values.
* `ScaleByMinMax`: Scales the data in a dataset, taking minimum and maximum values as input parameters.

For each data processing transform, `MLTransform` runs in both `write` mode and `read` mode.

## MLTransform in write mode

In write mode, `MLTransform` generates artifacts such as the `min` and `max` values of the entire dataset. It then uses these generated artifacts to scale the entire dataset. This workflow is useful for data that you plan to use to train an ML model.

## MLTransform in read mode

In read mode, `MLTransform` uses the artifacts generated in `write` mode to scale the entire dataset.

For more information about using `MLTransform`, see [Preprocess data with MLTransform](https://beam.apache.org/documentation/ml/preprocess-data/) in the Apache Beam documentation.

## Import the required modules

To use `MLTransform`, install `tensorflow_transform` and the Apache Beam SDK version 2.53.0 or later.

In [3]:
import os
import tempfile
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.transforms.tft import ScaleTo01
from apache_beam.ml.transforms.tft import ScaleByMinMax
from apache_beam.ml.transforms.tft import ScaleToZScore

In [4]:
# One scratch directory per scaling transform. The write-mode pipelines are
# pointed at these directories, and the read-mode pipelines read from them.
(
    artifact_location_scale_to_01,
    artifact_location_scale_to_zscore,
    artifact_location_scale_by_min_max,
) = (
    tempfile.mkdtemp(prefix=f'{transform_name}_')
    for transform_name in ('scale_to_01', 'scale_to_zscore', 'scale_by_min_max')
)

In [5]:
# Sample rows fed to the write-mode pipelines. Each row carries the same two
# integer features.
data = [
    dict(int_feature_1=11, int_feature_2=-10),
    dict(int_feature_1=34, int_feature_2=-33),
    dict(int_feature_1=5, int_feature_2=-63),
    dict(int_feature_1=12, int_feature_2=-38),
    dict(int_feature_1=32, int_feature_2=-65),
    dict(int_feature_1=63, int_feature_2=-21),
]


# Sample rows fed to the read-mode pipelines. Some values fall outside the
# min/max of `data` (for example -5 and 70 for int_feature_1).
test_data = [
    dict(int_feature_1=29, int_feature_2=-20),
    dict(int_feature_1=-5, int_feature_2=-11),
    dict(int_feature_1=5, int_feature_2=-44),
    dict(int_feature_1=29, int_feature_2=-12),
    dict(int_feature_1=20, int_feature_2=-53),
    dict(int_feature_1=70, int_feature_2=-8),
]


### Scale the data between 0 and 1

Scale the data so that it's in the range of 0 and 1. To scale the data, the transform calculates `minimum` and `maximum` values on the whole dataset, and then performs the following calculation:

`x = (x - x_min) / (x_max - x_min)`

To scale the data, use the [ScaleTo01](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.transforms.tft.html#apache_beam.ml.transforms.tft.ScaleTo01) data processing transform in `MLTransform`.

In [6]:
# MLTransform in write mode: scale both feature columns of `data` with
# ScaleTo01, pointing MLTransform at the ScaleTo01 artifact directory.
with beam.Pipeline() as pipeline:
  # Build the configured transform first so the pipeline wiring stays short.
  scale_to_01 = MLTransform(
      write_artifact_location=artifact_location_scale_to_01
  ).with_transform(ScaleTo01(columns=['int_feature_1', 'int_feature_2']))

  data_pcoll = pipeline | "CreateData" >> beam.Create(data)
  transformed_pcoll = data_pcoll | "MLTransform" >> scale_to_01
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)





Row(int_feature_1=array([0.10344828], dtype=float32), int_feature_2=array([1.], dtype=float32))
Row(int_feature_1=array([0.5], dtype=float32), int_feature_2=array([0.58181816], dtype=float32))
Row(int_feature_1=array([0.], dtype=float32), int_feature_2=array([0.03636364], dtype=float32))
Row(int_feature_1=array([0.12068965], dtype=float32), int_feature_2=array([0.4909091], dtype=float32))
Row(int_feature_1=array([0.46551725], dtype=float32), int_feature_2=array([0.], dtype=float32))
Row(int_feature_1=array([1.], dtype=float32), int_feature_2=array([0.8], dtype=float32))


In [7]:
# MLTransform in read mode: no transform is configured here; MLTransform is
# only pointed at the artifact directory used by the ScaleTo01 write-mode run.
with beam.Pipeline() as pipeline:
  apply_saved_scaling = MLTransform(
      read_artifact_location=artifact_location_scale_to_01
  )

  data_pcoll = pipeline | "CreateData" >> beam.Create(test_data)
  transformed_pcoll = data_pcoll | "MLTransform" >> apply_saved_scaling
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)



Row(int_feature_1=array([0.41379312], dtype=float32), int_feature_2=array([0.8181818], dtype=float32))
Row(int_feature_1=array([-0.1724138], dtype=float32), int_feature_2=array([0.9818182], dtype=float32))
Row(int_feature_1=array([0.], dtype=float32), int_feature_2=array([0.38181818], dtype=float32))
Row(int_feature_1=array([0.41379312], dtype=float32), int_feature_2=array([0.96363634], dtype=float32))
Row(int_feature_1=array([0.25862068], dtype=float32), int_feature_2=array([0.21818182], dtype=float32))
Row(int_feature_1=array([1.1206896], dtype=float32), int_feature_2=array([1.0363636], dtype=float32))


### Scale the data by using the z-score

Similar to `ScaleTo01`, use [ScaleToZScore](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.transforms.tft.html#apache_beam.ml.transforms.tft.ScaleToZScore) to scale the values by using the [z-score](https://www.tensorflow.org/tfx/transform/api_docs/python/tft/scale_to_z_score#:~:text=Scaling%20to%20z%2Dscore%20subtracts%20out%20the%20mean%20and%20divides%20by%20standard%20deviation.%20Note%20that%20the%20standard%20deviation%20computed%20here%20is%20based%20on%20the%20biased%20variance%20(0%20delta%20degrees%20of%20freedom)%2C%20as%20computed%20by%20analyzers.var.).


In [8]:
# MLTransform in write mode: scale both feature columns of `data` with
# ScaleToZScore, pointing MLTransform at the z-score artifact directory.
with beam.Pipeline() as pipeline:
  # Build the configured transform first so the pipeline wiring stays short.
  scale_to_zscore = MLTransform(
      write_artifact_location=artifact_location_scale_to_zscore
  ).with_transform(ScaleToZScore(columns=['int_feature_1', 'int_feature_2']))

  data_pcoll = pipeline | "CreateData" >> beam.Create(data)
  transformed_pcoll = data_pcoll | "MLTransform" >> scale_to_zscore
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)



Row(int_feature_1=array([-0.76950264], dtype=float32), int_feature_2=array([1.401755], dtype=float32))
Row(int_feature_1=array([0.3974355], dtype=float32), int_feature_2=array([0.2638597], dtype=float32))
Row(int_feature_1=array([-1.0739213], dtype=float32), int_feature_2=array([-1.2203515], dtype=float32))
Row(int_feature_1=array([-0.7187662], dtype=float32), int_feature_2=array([0.01649117], dtype=float32))
Row(int_feature_1=array([0.2959626], dtype=float32), int_feature_2=array([-1.3192989], dtype=float32))
Row(int_feature_1=array([1.8687923], dtype=float32), int_feature_2=array([0.8575442], dtype=float32))


In [9]:
# MLTransform in read mode: no transform is configured here; MLTransform is
# only pointed at the artifact directory used by the ScaleToZScore write-mode run.
with beam.Pipeline() as pipeline:
  apply_saved_scaling = MLTransform(
      read_artifact_location=artifact_location_scale_to_zscore
  )

  data_pcoll = pipeline | "CreateData" >> beam.Create(test_data)
  transformed_pcoll = data_pcoll | "MLTransform" >> apply_saved_scaling
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)



Row(int_feature_1=array([0.14375328], dtype=float32), int_feature_2=array([0.9070179], dtype=float32))
Row(int_feature_1=array([-1.5812857], dtype=float32), int_feature_2=array([1.3522812], dtype=float32))
Row(int_feature_1=array([-1.0739213], dtype=float32), int_feature_2=array([-0.28035107], dtype=float32))
Row(int_feature_1=array([0.14375328], dtype=float32), int_feature_2=array([1.3028076], dtype=float32))
Row(int_feature_1=array([-0.31287467], dtype=float32), int_feature_2=array([-0.7256144], dtype=float32))
Row(int_feature_1=array([2.2239475], dtype=float32), int_feature_2=array([1.5007024], dtype=float32))


### Scale the data by using ScaleByMinMax

Use  [ScaleByMinMax](https://github.com/apache/beam/blob/9e8a310f0c0faddfba28176df5893d8ad8fd10a0/sdks/python/apache_beam/ml/transforms/tft.py#L450) to scale your data into the range of `[min_value, max_value]`.

In [10]:
# Bounds of the target range passed to ScaleByMinMax.
min_value, max_value = 1, 10

# MLTransform in write mode: scale both feature columns of `data` with
# ScaleByMinMax, pointing MLTransform at the min-max artifact directory.
with beam.Pipeline() as pipeline:
  # Build the configured transform first so the pipeline wiring stays short.
  scale_by_min_max = MLTransform(
      write_artifact_location=artifact_location_scale_by_min_max
  ).with_transform(
      ScaleByMinMax(
          columns=['int_feature_1', 'int_feature_2'],
          min_value=min_value,
          max_value=max_value,
      )
  )

  data_pcoll = pipeline | "CreateData" >> beam.Create(data)
  transformed_pcoll = data_pcoll | "MLTransform" >> scale_by_min_max
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)



Row(int_feature_1=array([1.9310346], dtype=float32), int_feature_2=array([10.], dtype=float32))
Row(int_feature_1=array([5.5], dtype=float32), int_feature_2=array([6.2363634], dtype=float32))
Row(int_feature_1=array([1.], dtype=float32), int_feature_2=array([1.3272727], dtype=float32))
Row(int_feature_1=array([2.086207], dtype=float32), int_feature_2=array([5.418182], dtype=float32))
Row(int_feature_1=array([5.1896553], dtype=float32), int_feature_2=array([1.], dtype=float32))
Row(int_feature_1=array([10.], dtype=float32), int_feature_2=array([8.200001], dtype=float32))


In [11]:
# MLTransform in read mode: no transform is configured here; MLTransform is
# only pointed at the artifact directory used by the ScaleByMinMax write-mode run.
with beam.Pipeline() as pipeline:
  apply_saved_scaling = MLTransform(
      read_artifact_location=artifact_location_scale_by_min_max
  )

  data_pcoll = pipeline | "CreateData" >> beam.Create(test_data)
  transformed_pcoll = data_pcoll | "MLTransform" >> apply_saved_scaling
  # Print every transformed row so the scaled values show up in the cell output.
  transformed_pcoll | "Print" >> beam.Map(print)



Row(int_feature_1=array([4.7241383], dtype=float32), int_feature_2=array([8.363636], dtype=float32))
Row(int_feature_1=array([-0.5517242], dtype=float32), int_feature_2=array([9.836364], dtype=float32))
Row(int_feature_1=array([1.], dtype=float32), int_feature_2=array([4.4363637], dtype=float32))
Row(int_feature_1=array([4.7241383], dtype=float32), int_feature_2=array([9.672727], dtype=float32))
Row(int_feature_1=array([3.3275862], dtype=float32), int_feature_2=array([2.9636364], dtype=float32))
Row(int_feature_1=array([11.086206], dtype=float32), int_feature_2=array([10.327272], dtype=float32))
