# Batch Transform on Amazon SageMaker Pipelines Integrated with PrestoDB


***This notebook works best with the `Data Science 3.0` kernel on an `ml.t3.medium` instance type***.

Run the [0_model_training_pipeline](./0_model_training_pipeline.ipynb) notebook prior to running the notebook. This notebook runs a batch transform using the model trained in the previous notebook. It does so by running the following steps:

1. Extract the latest approved model from the SageMaker model registry.

1. Read raw data for inference from PrestoDB and stores in an Amazon S3 bucket.

1. Create a SageMaker pipeline with a data processing step and a batch transform step to provide inference on the data. The inference results are also stored in S3.

In [1]:
#import sys
#!{sys.executable} -m pip install -r requirements.txt

In [2]:
## Install the necessary boto3 and sagemaker libraries to initialize session
import json
import boto3
import time
import logging
import sagemaker
import sagemaker.session
from typing import Dict, List
from datetime import datetime, timedelta
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline_context import PipelineSession
from utils import load_config, make_s3_prefix, print_pipeline_execution_summary

from sagemaker.workflow.functions import Join
from sagemaker.processing import  ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.execution_variables import ExecutionVariables

from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.transformer import Transformer

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\aroraai\AppData\Local\sagemaker\sagemaker\config.yaml


In [3]:
## set the logger to track all of the logs as this pipeline runs
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

### Load the Config.yml file that contains information that is used across this pipeline

In [4]:
config = load_config('config.yml')
logger.info(json.dumps(config, indent=2))

[2024-02-23 19:54:15,951] p17664 {2294058105.py:2} INFO - {
  "aws": {
    "region": "us-east-1",
    "sagemaker_execution_role": "arn:aws:iam::015469603702:role/SageMakerRepoRole",
    "s3_bucket": "sagemaker-{region}-{account_id}",
    "s3_prefix": "mlops-pipeline-model"
  },
  "presto": {
    "host": "3.93.186.209",
    "parameter": "8080",
    "user": "ec2-user"
  },
  "pipeline": {
    "training_pipeline_name": "mlops-pipeline-presto",
    "transform_pipeline_name": "mlops-batch-inference",
    "execution_display_name": "mlops-prestodb-pipeline",
    "tags": [
      {
        "Key": "team",
        "Value": "my-team"
      }
    ]
  },
  "training_step": {
    "training_target": "high_value_order",
    "training_features": [
      "total_extended_price",
      "avg_discount",
      "total_quantity"
    ],
    "sklearn_framework_version": "0.23-1",
    "n_estimators": 75,
    "max_depth": 10,
    "min_samples_split": 2,
    "max_features": "sqrt",
    "instance_type": "ml.m5.xlarge

In [5]:
## initialize the sagemaker session, region, role bucket and pipeline session
session = sagemaker.session.Session()
region = session.boto_region_name
pipeline_session = PipelineSession()

role = config['aws']['sagemaker_execution_role']
ci = boto3.client('sts').get_caller_identity()
bucket = config['aws']['s3_bucket'].format(account_id=ci['Account'], region=region)
prefix = config['aws']['s3_prefix']  # Prefix to S3 artifacts

logger.info(f"bucket={bucket}, prefix={prefix}, role={role}")

[2024-02-23 19:54:16,051] p17664 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2024-02-23 19:54:17,295] p17664 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2024-02-23 19:54:18,825] p17664 {credentials.py:1278} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2024-02-23 19:54:21,567] p17664 {564713264.py:14} INFO - bucket=sagemaker-us-east-1-015469603702, prefix=mlops-pipeline-model, batch_transform_inference_prefix=batch_transform_inference/yyyy=2024/mm=2/dd=23/hh=14/mm=24,             batch_transform_data_prefix=batch_transform_data/yyyy=2024/mm=2/dd=23/hh=14/mm=24, role=arn:aws:iam::015469603702:role/SageMakerRepoRole


In [6]:
# Convert your list to a JSON string
training_features_str = json.dumps(config['training_step']['training_features'])
logger.info(f"the training features being used for this pipeline --> {training_features_str}")

# Define new pipeline parameters
host_parameter = ParameterString(name="HostParameter", default_value=config['presto']['host'])
port_parameter = ParameterString(name="PortParameter", default_value=config['presto']['parameter'])
user_parameter = ParameterString(name="UserParameter", default_value=config['presto']['user'])
target_parameter = ParameterString(name="Target", default_value=config['training_step']['training_target'])
feature_parameter = ParameterString(name="Feature", default_value=training_features_str)

[2024-02-23 19:54:21,626] p17664 {1309184461.py:3} INFO - the training features being used for this pipeline --> ["total_extended_price", "avg_discount", "total_quantity"]


<a id='parameters'></a>

### Pipeline input parameters

Pipeline Parameters are input parameter when triggering a pipeline execution. They need to be explicitly defined when creating the pipeline and contain default values.

Create parameters for the inputs to the pipeline. In this case, parameters will be used for:

- `ProcessingInstanceType` - What EC2 instance type to use for processing.
- `TrainingInstanceType` - What EC2 instance type to use for training.

In [7]:
from sagemaker.sklearn.processing import SKLearnProcessor

# What instance type to use for processing.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value=config['data_processing_step']['processing_instance_type']
)

## initializing the sklearn processor
sklearn_processor = SKLearnProcessor(framework_version=config['training_step']['sklearn_framework_version'],
                                     role=role,
                                     instance_type=processing_instance_type,
                                     instance_count=config['data_processing_step']['instance_count'])

[2024-02-23 19:54:22,253] p17664 {image_uris.py:581} INFO - Defaulting to only available Python version: py3


#### Create an Image URI object to use while creating the model from the approved model in the registry

In [8]:
# Fetch container to use for training
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=config['aws']['region'],
    version=config['training_step']['sklearn_framework_version'],
    py_version="py3",
    instance_type=config['data_processing_step']['processing_instance_type'],
)
logger.info(f"processing step image_uri={image_uri}")

[2024-02-23 19:54:22,590] p17664 {2531010310.py:9} INFO - processing step image_uri=683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3


### Now, step is to approve the model
---
Finally, approve the model to launch the model deployment process

In [9]:
sm = boto3.client("sagemaker")

# list all model packages and select the latest one
model_packages = []

for p in sm.get_paginator('list_model_packages').paginate(
        ModelPackageGroupName=config['register_model_step']['model_group'],
        SortBy="CreationTime",
        SortOrder="Descending",
    ):
    model_packages.extend(p["ModelPackageSummaryList"])

if len(model_packages) == 0:
    raise Exception(f"No model package is found for {config['register_model_step']['model_group']} model package group")

## print the latest model, approve it
latest_model_package_arn = model_packages[0]["ModelPackageArn"]
logger.info(f"for model_group={config['register_model_step']['model_group']}, latest_model_package_arn={latest_model_package_arn}")

[2024-02-23 19:54:24,879] p17664 {1612686606.py:18} INFO - for model_group=mlops-presto, latest_model_package_arn=arn:aws:sagemaker:us-east-1:015469603702:model-package/mlops-presto/3


The following statement sets the ModelApprovalStatus for the model package to Approved. The model package state change will launch the EventBridge rule and the rule will launch the CodePipeline CI/CD pipeline with model deployment.

In [10]:
## updating the latest model package to approved status to use it for batch inference
model_package_update_response = sm.update_model_package(
    ModelPackageArn=latest_model_package_arn,
    ModelApprovalStatus="Approved",
)

## PART 2: Batch Transform Pipeline: Prepare Batch Data & Perform Batch Inference

### first step is to get the latest batch data from presto and use that for batch transform step

In [11]:
# Use the sklearn_processor in a SageMaker Pipelines ProcessingStep
# Configure the ProcessingStep
batch_data_prep = ProcessingStep(
    name=config['data_processing_step']['step_name'],
    processor=sklearn_processor,
    outputs=[
        ProcessingOutput(
            output_name="batch",
            source="/opt/ml/processing/batch",
            destination=Join(
                on="/",
                values=[
                    "s3://{}".format(bucket),
                    prefix,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    "batch",
                ], 
            ),
        ),
    ],
    code = config['scripts']['batch_transform_get_data'],
    job_arguments=[
        ## these job parameters are required in the process of getting batch data from presto
        ## and then send it to s3 for the process of batch inference
        "--host", host_parameter, ## represents the host parameter for the batch data
        "--port", port_parameter, ## represents the port for the EC2
        "--user", user_parameter, ## represents the username for the presto
    ],
)

### Batch Transform Configuration begins below:
---

1. Create the model with the model image uri, refer to the 'inference.py' script that grabs information on features to use while making predictions.

2. Create the model which automatically triggers the training and the preprocess data step

3. Run the transformer step on the created model and 

In [12]:
client = boto3.client("sagemaker")
list_model_packages_response = client.list_model_packages(ModelPackageGroupName=config['register_model_step']['model_group'])
logger.info(f"list_model_packages_response={list_model_packages_response}")

latest_model_version_arn = list_model_packages_response["ModelPackageSummaryList"][0][
    "ModelPackageArn"
]
logger.info(f"latest_model_version_arn={latest_model_version_arn}")

[2024-02-23 19:54:27,506] p17664 {1380157791.py:3} INFO - list_model_packages_response={'ModelPackageSummaryList': [{'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 3, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:015469603702:model-package/mlops-presto/3', 'CreationTime': datetime.datetime(2024, 2, 23, 19, 28, 38, 551000, tzinfo=tzlocal()), 'ModelPackageStatus': 'Completed', 'ModelApprovalStatus': 'Approved'}, {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 2, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:015469603702:model-package/mlops-presto/2', 'CreationTime': datetime.datetime(2024, 2, 23, 19, 11, 42, 189000, tzinfo=tzlocal()), 'ModelPackageStatus': 'Completed', 'ModelApprovalStatus': 'PendingManualApproval'}, {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 1, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:015469603702:model-package/mlops-presto/1', 'CreationTime': datetime.datetime(2024, 2, 23, 19, 11, 18, 479000, tzinfo=tzl

In [13]:
try:
    latest_approved_model_package = client.describe_model_package(ModelPackageName=latest_model_version_arn)

    if latest_approved_model_package['ModelApprovalStatus'] == "Approved":
        logger.info(f"The latest approved model package is --> {latest_approved_model_package}")
        model_data_url = latest_approved_model_package['InferenceSpecification']['Containers'][0]['ModelDataUrl']
        logger.info(f"The model data for the latest approved model arn {latest_model_version_arn} is stored in {model_data_url}")
    else:
        # If the model approval status is not PendingApproval, throw an error exception
        error_message = f"ModelApprovalStatus is not PendingApproval. Current status: {latest_approved_model_package['ModelApprovalStatus']}"
        logger.error(error_message)
        raise ValueError(error_message)

except Exception as e:
    logger.error(f"An error occurred while tracking the approved model: {str(e)}")
    raise e



[2024-02-23 19:54:27,825] p17664 {226519901.py:5} INFO - The latest approved model package is --> {'ModelPackageGroupName': 'mlops-presto', 'ModelPackageVersion': 3, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:015469603702:model-package/mlops-presto/3', 'CreationTime': datetime.datetime(2024, 2, 23, 19, 28, 38, 551000, tzinfo=tzlocal()), 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3', 'ImageDigest': 'sha256:26e6faf825e29bb9fd048d3af6dc1c4e4fb1a172caa49bf7f68112324447f850', 'ModelDataUrl': 's3://sagemaker-us-east-1-015469603702/7dtbiu1emnry-Train-An-2I3X0W7DVE-001-1427a8db/output/model.tar.gz'}], 'SupportedTransformInstanceTypes': ['ml.m5.xlarge'], 'SupportedRealtimeInferenceInstanceTypes': ['ml.t2.medium', 'ml.m5.xlarge', 'ml.m5.large'], 'SupportedContentTypes': ['text/csv'], 'SupportedResponseMIMETypes': ['text/csv']}, 'ModelPackageStatus': 'Completed', 'ModelPackageStatusDetails': {'Validati

In [14]:
## create the model image based on the model data and refer to the inference script as an entry point for 
## batch inference
model = Model(
    image_uri=image_uri,
    entry_point=config['scripts']['batch_inference'],
    model_data=model_data_url,
    sagemaker_session=pipeline_session,
    role=role,
)

#### Create the model image from the approved model for batch inference in the next step

In [15]:
step_create_model = ModelStep(
    name=config['register_model_step']['model_name'],
    step_args=model.create(instance_type=config['transform_step']['instance_type']),
)



### Define a Transform Step to Perform Batch Transformation

Now that a model instance is defined, create a Transformer instance with the appropriate model type, compute instance type, and desired output S3 URI.

Specifically, pass in the ModelName from the CreateModelStep, step_create_model properties. The CreateModelStep properties attribute matches the object model of the DescribeModel response object.

In [16]:


# Capture the current time for recording the start and end time for the batch transform step
et = datetime.utcnow()
st = et - timedelta(hours=config['transform_step']['num_hours_to_go_back'])
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    instance_type=config['transform_step']['instance_type'],
    instance_count=config['transform_step']['instance_count'],
    strategy="MultiRecord",
    accept="text/csv",
    assemble_with="Line",
    output_path=f"s3://{bucket}",
    tags = config['transform_step']['tags'], 
    env={
        'START_TIME_UTC': st.strftime('%Y-%m-%d %H:%M:%S'), 
        'END_TIME_UTC': et.strftime('%Y-%m-%d %H:%M:%S'),
    }
    
)

### Pass in the transformer instance and the TransformInput with the batch_data pipeline parameter defined earlier.

In [17]:
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

# Assuming batch_prediction_data is the S3 path where your input data is stored
transform_input = TransformInput(
    data=batch_data_prep.properties.ProcessingOutputConfig.Outputs[
                "batch" ## this refers to the batch data that is configured within s3 after the batch preprocessing step
            ].S3Output.S3Uri,
    
    content_type="text/csv", 
    split_type="Line")

step_transform = TransformStep(
    name=config['transform_step']['step_name'], transformer=transformer, inputs=transform_input, 
)

In [18]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = config['pipeline']['transform_pipeline_name']

batch_transform_pipeline = Pipeline(
    name=pipeline_name,
    parameters=
    [processing_instance_type,
    host_parameter,
    port_parameter,
    user_parameter,
    target_parameter, 
    feature_parameter,],
    
    steps=[
        batch_data_prep,
        step_create_model, 
        step_transform,
    ],
)

In [19]:
batch_transform_pipeline.upsert(role_arn=role, tags = config['pipeline']['tags'])



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:015469603702:pipeline/mlops-batch-inference',
 'ResponseMetadata': {'RequestId': '0a46d64c-8a00-4ee4-a5e5-b12a776fd0f2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0a46d64c-8a00-4ee4-a5e5-b12a776fd0f2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '89',
   'date': 'Fri, 23 Feb 2024 14:24:50 GMT'},
  'RetryAttempts': 0}}

In [20]:
execution = batch_transform_pipeline.start()

In [21]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:015469603702:pipeline/mlops-batch-inference',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:015469603702:pipeline/mlops-batch-inference/execution/p7fq8w4xbveu',
 'PipelineExecutionDisplayName': 'execution-1708698292901',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2024, 2, 23, 19, 54, 52, 854000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 2, 23, 19, 54, 52, 854000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': 'efd7f4bf-5843-48b7-b216-e378906e977a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'efd7f4bf-5843-48b7-b216-e378906e977a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '407',
   'date': 'Fri, 23 Feb 2024 14:24:52 GMT'},
  'RetryAttempts': 0}}

In [22]:
st = time.perf_counter()
execution.wait()
et = time.perf_counter() - st
logger.info(f"pipeline={batch_transform_pipeline.name} took {et-st:.2f} seconds to run")

[2024-02-23 20:05:09,019] p17664 {2209960028.py:4} INFO - pipeline=mlops-batch-inference took -174150.27 seconds to run


In [23]:
print_pipeline_execution_summary(execution.list_steps(), batch_transform_pipeline.name)

NameError: name 'json' is not defined