# Batch Transform on Amazon SageMaker Pipelines Integrated with PrestoDB


***This notebook works best with the `Data Science 3.0` kernel on an `ml.t3.medium` instance type***.

Run the [0_model_training_pipeline](./0_model_training_pipeline.ipynb) notebook prior to running the notebook. This notebook runs a batch transform using the model trained in the previous notebook. It does so by running the following steps:

1. Extract the latest approved model from the SageMaker model registry.

1. Read raw data for inference from PrestoDB and stores in an Amazon S3 bucket.

1. Create a SageMaker pipeline with a data processing step and a batch transform step to provide inference on the data. The inference results are also stored in S3.

In [2]:
#import sys
#!{sys.executable} -m pip install -r requirements.txt

In [9]:
## Install the necessary boto3 and sagemaker libraries to initialize session
import json
import boto3
import time
import logging
import sagemaker
import sagemaker.session
from typing import Dict, List
from datetime import datetime, timedelta
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline_context import PipelineSession
from utils import load_config, print_pipeline_execution_summary

from sagemaker.workflow.functions import Join
from sagemaker.processing import  ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.execution_variables import ExecutionVariables

from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.transformer import Transformer

In [10]:
## set the logger to track all of the logs as this pipeline runs
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

### Load the Config.yml file that contains information that is used across this pipeline

In [11]:
config = load_config('config.yml')
logger.info(json.dumps(config, indent=2))

[2024-02-25 00:02:24,610] p6796 {2294058105.py:2} INFO - {
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role_name": "AmazonSageMaker-ExecutionRole-20230807T175994",
    "sagemaker_execution_role_arn": "arn:aws:iam::{account_id}:role/service-role/{role}",
    "s3_bucket": "sagemaker-{region}-{account_id}",
    "s3_prefix": "mlops-pipeline-model"
  },
  "presto": {
    "host": "3.93.186.209",
    "parameter": "8080",
    "presto_credentials": "presto-credentials",
    "catalog": "tpch",
    "schema": "tiny"
  },
  "pipeline": {
    "training_pipeline_name": "mlops-pipeline-presto",
    "transform_pipeline_name": "mlops-batch-inference",
    "execution_display_name": "mlops-prestodb-pipeline",
    "base_job_name": "mlops-prestodb",
    "tags": [
      {
        "Key": "team",
        "Value": "my-team"
      }
    ]
  },
  "training_step": {
    "training_target": "high_value_order",
    "training_features": [
      "total_extended_price",
      "avg_discount",
      "to

In [12]:
## initialize the sagemaker session, region, role bucket and pipeline session
session = sagemaker.session.Session()
region = session.boto_region_name
pipeline_session = PipelineSession()

ci = boto3.client('sts').get_caller_identity()

role_name = config['aws']['sagemaker_execution_role_name']
config['aws']['sagemaker_execution_role_arn'] = config['aws']['sagemaker_execution_role_arn'].format(account_id=ci['Account'], role=role_name)
role = config['aws']['sagemaker_execution_role_arn']

bucket = config['aws']['s3_bucket'].format(account_id=ci['Account'], region=region)
prefix = config['aws']['s3_prefix']  # Prefix to S3 artifacts

logger.info(f"bucket={bucket}, prefix={prefix}, role={role}")

[2024-02-25 00:02:25,063] p6796 {751560243.py:15} INFO - bucket=sagemaker-us-east-1-218208277580, prefix=mlops-pipeline-model, role=arn:aws:iam::218208277580:role/service-role/AmazonSageMaker-ExecutionRole-20230807T175994


In [13]:
# Convert your list to a JSON string
training_features_str = json.dumps(config['training_step']['training_features'])
logger.info(f"the training features being used for this pipeline --> {training_features_str}")

# Define new pipeline parameters
host_parameter = ParameterString(name="HostParameter", default_value=config['presto']['host'])
port_parameter = ParameterString(name="PortParameter", default_value=config['presto']['parameter'])
target_parameter = ParameterString(name="Target", default_value=config['training_step']['training_target'])
feature_parameter = ParameterString(name="Feature", default_value=training_features_str)

## presto credential key and region pipeline parameters
presto_parameter = ParameterString(name="PrestoParameter", default_value=config['presto']['presto_credentials'])
region_parameter = ParameterString(name="Region", default_value=config['aws']['region'])

## represents the parameters being used to track the catalog and the schema needed to connect to the presto server
presto_catalog_parameter = ParameterString(name="Catalog", default_value=config['presto']['catalog'])
presto_schema_parameter = ParameterString(name="Schema", default_value=config['presto']['schema'])

[2024-02-25 00:02:25,073] p6796 {1620125757.py:3} INFO - the training features being used for this pipeline --> ["total_extended_price", "avg_discount", "total_quantity"]


<a id='parameters'></a>

### Pipeline input parameters

Pipeline Parameters are input parameter when triggering a pipeline execution. They need to be explicitly defined when creating the pipeline and contain default values.

Create parameters for the inputs to the pipeline. In this case, parameters will be used for:

- `ProcessingInstanceType` - What EC2 instance type to use for processing.
- `TrainingInstanceType` - What EC2 instance type to use for training.

In [14]:
from sagemaker.sklearn.processing import SKLearnProcessor

# What instance type to use for processing.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value=config['data_processing_step']['processing_instance_type']
)


# Create SKlearn processor object,
# The object contains information about what instance type to use, the IAM role to use etc.
# A managed processor comes with a preconfigured container, so only specifying version is required.

est_cls = sagemaker.sklearn.estimator.SKLearn

sklearn_processor = FrameworkProcessor(
                                     estimator_cls=est_cls,
                                     framework_version=config['training_step']['sklearn_framework_version'],
                                     role=role,
                                     instance_type=processing_instance_type,
                                     instance_count=config['data_processing_step']['instance_count'],
                                     tags=config['data_processing_step']['tags'], 
                                     sagemaker_session=pipeline_session,
                                     base_job_name=config['pipeline']['base_job_name'], )



#### Create an Image URI object to use while creating the model from the approved model in the registry

In [15]:
# Fetch container to use for training
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=config['aws']['region'],
    version=config['training_step']['sklearn_framework_version'],
    py_version="py3",
    instance_type=config['data_processing_step']['processing_instance_type'],
)
logger.info(f"processing step image_uri={image_uri}")

[2024-02-25 00:02:29,691] p6796 {2531010310.py:9} INFO - processing step image_uri=683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3


### Now, step is to approve the model
---
Finally, approve the model to launch the model deployment process

In [16]:
sm = boto3.client("sagemaker")

# list all model packages and select the latest one
model_packages = []

for p in sm.get_paginator('list_model_packages').paginate(
        ModelPackageGroupName=config['register_model_step']['model_group'],
        SortBy="CreationTime",
        SortOrder="Descending",
    ):
    model_packages.extend(p["ModelPackageSummaryList"])

if len(model_packages) == 0:
    raise Exception(f"No model package is found for {config['register_model_step']['model_group']} model package group")

## print the latest model, approve it
latest_model_package_arn = model_packages[0]["ModelPackageArn"]
logger.info(f"for model_group={config['register_model_step']['model_group']}, latest_model_package_arn={latest_model_package_arn}")

[2024-02-25 00:02:31,269] p6796 {1612686606.py:18} INFO - for model_group=mlops-presto, latest_model_package_arn=arn:aws:sagemaker:us-east-1:218208277580:model-package/mlops-presto/5


The following statement sets the ModelApprovalStatus for the model package to Approved. The model package state change will launch the EventBridge rule and the rule will launch the CodePipeline CI/CD pipeline with model deployment.

In [17]:
## updating the latest model package to approved status to use it for batch inference
model_package_update_response = sm.update_model_package(
    ModelPackageArn=latest_model_package_arn,
    ModelApprovalStatus="Approved",
)

## PART 2: Batch Transform Pipeline: Prepare Batch Data & Perform Batch Inference

### first step is to get the latest batch data from presto and use that for batch transform step

In [18]:
# Use the sklearn_processor in a SageMaker Pipelines ProcessingStep
# Configure the ProcessingStep

## represents the output processing for the batch pre processing step
batch_output=[
        ProcessingOutput(
            output_name="batch",
            source="/opt/ml/processing/batch",
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    "batch",
                ], 
            ),
        ),
    ]


# Use the sklearn_processor's run method and configure the batch preprocessing step
step_args = sklearn_processor.run(
    code=config['scripts']['batch_transform_get_data'],
    source_dir=config['scripts']['source_dir'], 
    outputs=batch_output,
    arguments=[
        "--host", host_parameter,
        "--port", port_parameter,
        "--presto_credentials_key", presto_parameter,
        "--region", region_parameter,
        "--presto_catalog", presto_catalog_parameter,
        "--presto_schema", presto_schema_parameter,
    ],
)


batch_data_prep = ProcessingStep(
    name=config['data_processing_step']['step_name'],
    step_args=step_args,
)



### Batch Transform Configuration begins below:
---

1. Create the model with the model image uri, refer to the 'inference.py' script that grabs information on features to use while making predictions.

2. Create the model which automatically triggers the training and the preprocess data step

3. Run the transformer step on the created model and 

In [19]:
client = boto3.client("sagemaker")
list_model_packages_response = client.list_model_packages(ModelPackageGroupName=config['register_model_step']['model_group'])
logger.info(f"list_model_packages_response={list_model_packages_response}")

latest_model_version_arn = list_model_packages_response["ModelPackageSummaryList"][0][
    "ModelPackageArn"
]
logger.info(f"latest_model_version_arn={latest_model_version_arn}")

[2024-02-25 00:02:38,931] p6796 {1380157791.py:3} INFO - list_model_packages_response={'ModelPackageSummaryList': [{'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 5, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:218208277580:model-package/mlops-presto/5', 'CreationTime': datetime.datetime(2024, 2, 24, 23, 57, 10, 650000, tzinfo=tzlocal()), 'ModelPackageStatus': 'Completed', 'ModelApprovalStatus': 'Approved'}, {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 4, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:218208277580:model-package/mlops-presto/4', 'CreationTime': datetime.datetime(2024, 2, 24, 3, 9, 42, 143000, tzinfo=tzlocal()), 'ModelPackageStatus': 'Completed', 'ModelApprovalStatus': 'PendingManualApproval'}, {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 3, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:218208277580:model-package/mlops-presto/3', 'CreationTime': datetime.datetime(2024, 2, 24, 0, 34, 23, 794000, tzinfo=tzlocal

In [20]:
try:
    latest_approved_model_package = client.describe_model_package(ModelPackageName=latest_model_version_arn)

    if latest_approved_model_package['ModelApprovalStatus'] == "Approved":
        logger.info(f"The latest approved model package is --> {latest_approved_model_package}")
        model_data_url = latest_approved_model_package['InferenceSpecification']['Containers'][0]['ModelDataUrl']
        logger.info(f"The model data for the latest approved model arn {latest_model_version_arn} is stored in {model_data_url}")
    else:
        # If the model approval status is not PendingApproval, throw an error exception
        error_message = f"ModelApprovalStatus is not PendingApproval. Current status: {latest_approved_model_package['ModelApprovalStatus']}"
        logger.error(error_message)
        raise ValueError(error_message)

except Exception as e:
    logger.error(f"An error occurred while tracking the approved model: {str(e)}")
    raise e



[2024-02-25 00:02:40,311] p6796 {226519901.py:5} INFO - The latest approved model package is --> {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 5, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:218208277580:model-package/mlops-presto/5', 'CreationTime': datetime.datetime(2024, 2, 24, 23, 57, 10, 650000, tzinfo=tzlocal()), 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3', 'ImageDigest': 'sha256:e7fea5cd095518578d5cba832758d90d59cba68b7858464aabd2bffd83d96d03', 'ModelDataUrl': 's3://sagemaker-us-east-1-218208277580/waxtqwfd937y-Train-An-gVDb3Vrlj3-002-0cd6a0b8/output/model.tar.gz'}], 'SupportedTransformInstanceTypes': ['ml.m5.xlarge'], 'SupportedRealtimeInferenceInstanceTypes': ['ml.t2.medium', 'ml.m5.xlarge', 'ml.m5.large'], 'SupportedContentTypes': ['text/csv'], 'SupportedResponseMIMETypes': ['text/csv']}, 'ModelPackageStatus': 'Completed', 'ModelPackageStatusDetails': {'Validatio

In [23]:
## create the model image based on the model data and refer to the inference script as an entry point for 
## batch inference
model = Model(
    image_uri=image_uri,
    entry_point=config['scripts']['batch_inference'],
    model_data=model_data_url,
    sagemaker_session=pipeline_session,
    role=role,
)

#### Create the model image from the approved model for batch inference in the next step

In [24]:
step_create_model = ModelStep(
    name=config['register_model_step']['model_name'],
    step_args=model.create(instance_type=config['transform_step']['instance_type']),
)

### Define a Transform Step to Perform Batch Transformation

Now that a model instance is defined, create a Transformer instance with the appropriate model type, compute instance type, and desired output S3 URI.

Specifically, pass in the ModelName from the CreateModelStep, step_create_model properties. The CreateModelStep properties attribute matches the object model of the DescribeModel response object.

In [25]:


# Capture the current time for recording the start and end time for the batch transform step
et = datetime.utcnow()
st = et - timedelta(hours=config['transform_step']['num_hours_to_go_back'])
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    instance_type=config['transform_step']['instance_type'],
    instance_count=config['transform_step']['instance_count'],
    strategy="MultiRecord",
    accept="text/csv",
    assemble_with="Line",
    output_path=f"s3://{bucket}",
    tags = config['transform_step']['tags'], 
    env={
        'START_TIME_UTC': st.strftime('%Y-%m-%d %H:%M:%S'), 
        'END_TIME_UTC': et.strftime('%Y-%m-%d %H:%M:%S'),
    }
    
)

### Pass in the transformer instance and the TransformInput with the batch_data pipeline parameter defined earlier.

In [26]:
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

# Assuming batch_prediction_data is the S3 path where your input data is stored
transform_input = TransformInput(
    data=batch_data_prep.properties.ProcessingOutputConfig.Outputs[
                "batch" ## this refers to the batch data that is configured within s3 after the batch preprocessing step
            ].S3Output.S3Uri,
    
    content_type="text/csv", 
    split_type="Line")

step_transform = TransformStep(
    name=config['transform_step']['step_name'], transformer=transformer, inputs=transform_input, 
)

In [29]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = config['pipeline']['transform_pipeline_name']

batch_transform_pipeline = Pipeline(
    name=pipeline_name,
    parameters=
    [processing_instance_type,
    host_parameter,
    presto_parameter,
    region_parameter,
    port_parameter,
    target_parameter, 
    feature_parameter,
    presto_catalog_parameter,
    presto_schema_parameter,],
    
    steps=[
        batch_data_prep,
        step_create_model, 
        step_transform,
    ],
)

In [30]:
batch_transform_pipeline.upsert(role_arn=role, tags = config['pipeline']['tags'])

[2024-02-25 00:04:30,876] p6796 {processing.py:1884} INFO - Uploaded code to s3://sagemaker-us-east-1-218208277580/mlops-batch-inference/code/ba1054aec71d572583ac1dc27fb900ed/sourcedir.tar.gz
[2024-02-25 00:04:30,907] p6796 {processing.py:1976} INFO - runproc.sh uploaded to s3://sagemaker-us-east-1-218208277580/mlops-batch-inference/code/39e432141f034d8953ce214e26275035/runproc.sh
[2024-02-25 00:04:31,472] p6796 {processing.py:1884} INFO - Uploaded code to s3://sagemaker-us-east-1-218208277580/mlops-batch-inference/code/ba1054aec71d572583ac1dc27fb900ed/sourcedir.tar.gz
[2024-02-25 00:04:31,498] p6796 {processing.py:1976} INFO - runproc.sh uploaded to s3://sagemaker-us-east-1-218208277580/mlops-batch-inference/code/39e432141f034d8953ce214e26275035/runproc.sh


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:218208277580:pipeline/mlops-batch-inference',
 'ResponseMetadata': {'RequestId': '073c63bc-b6bc-4c90-ba0d-5a93467e6ca7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '073c63bc-b6bc-4c90-ba0d-5a93467e6ca7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '89',
   'date': 'Sun, 25 Feb 2024 00:04:31 GMT'},
  'RetryAttempts': 0}}

In [31]:
execution = batch_transform_pipeline.start()

In [32]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:218208277580:pipeline/mlops-batch-inference',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:218208277580:pipeline/mlops-batch-inference/execution/l4cyqc13rbwk',
 'PipelineExecutionDisplayName': 'execution-1708819473137',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 2, 25, 0, 4, 33, 87000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 2, 25, 0, 4, 33, 87000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '20211a4b-675a-4106-9202-0d9e65415c99',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '20211a4b-675a-4106-9202-0d9e65415c99',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '407',
   'date': 'Sun, 25 Feb 2024 00:04:33 GMT'},
  'RetryAttempts': 0}}

In [33]:
st = time.perf_counter()
logger.info(f"starting pipeline={batch_transform_pipeline.name}")
execution.wait()
elapsed_time = time.perf_counter() - st
logger.info(f"pipeline={batch_transform_pipeline.name} took {elapsed_time:.2f} seconds to run")

[2024-02-25 00:04:33,975] p6796 {392233496.py:2} INFO - starting pipeline=mlops-batch-inference
[2024-02-25 00:14:36,509] p6796 {392233496.py:5} INFO - pipeline=mlops-batch-inference took 602.53 seconds to run


In [34]:
print_pipeline_execution_summary(execution.list_steps(), batch_transform_pipeline.name)

[2024-02-25 00:14:36,616] p6796 {utils.py:20} INFO - pipeline steps=[
  {
    "StepName": "mlops-RandomForestTransform",
    "StartTime": "2024-02-25 00:09:29.372000+00:00",
    "EndTime": "2024-02-25 00:14:28.851000+00:00",
    "StepStatus": "Succeeded",
    "Metadata": {
      "TransformJob": {
        "Arn": "arn:aws:sagemaker:us-east-1:218208277580:transform-job/pipelines-l4cyqc13rbwk-mlops-RandomForestTr-MO3TVjHpIk"
      }
    },
    "AttemptCount": 1
  },
  {
    "StepName": "mlops-presto-CreateModel",
    "StartTime": "2024-02-25 00:04:34.306000+00:00",
    "EndTime": "2024-02-25 00:04:35.847000+00:00",
    "StepStatus": "Succeeded",
    "Metadata": {
      "Model": {
        "Arn": "arn:aws:sagemaker:us-east-1:218208277580:model/pipelines-l4cyqc13rbwk-mlops-presto-createm-du8zs6t3hh"
      }
    },
    "AttemptCount": 1
  },
  {
    "StepName": "Preprocess-Data",
    "StartTime": "2024-02-25 00:04:34.306000+00:00",
    "EndTime": "2024-02-25 00:09:28.794000+00:00",
    "StepSt