## Perform Batch Inference (Predictions) using SageMaker Batch Transform

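**Note:** This example notebook drives SageMaker through the low-level boto3 client instead of the high-level SageMaker Python SDK estimator/transformer classes. The SDK is used only for small helpers (execution role, default bucket, and container image URI lookup).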

### Imports

In [None]:
from sagemaker import get_execution_role
from time import gmtime, strftime
import pandas as pd
import sagemaker
import boto3
import time

### Essentials

In [None]:
# Notebook-wide settings: the AWS region name, the S3 key prefix for this
# project, and the default bucket of the current SageMaker session.
REGION = 'us-east-1'
PREFIX = 'clf'
BUCKET = sagemaker.Session().default_bucket()

In [None]:
# S3 prefix under BUCKET/PREFIX that holds the batch input; displayed for a quick visual check.
batch_input = f's3://{BUCKET}/{PREFIX}/batch_test/'
batch_input

In [None]:
# S3 prefix under BUCKET/PREFIX where the batch output is written; displayed for a quick visual check.
batch_output = f's3://{BUCKET}/{PREFIX}/batch_test_out/'
batch_output

In [None]:
# UTC timestamp (year-month-day-hour-minute-second, dash separated) that is
# appended to the model and batch job names.
timestamp_format = "%Y-%m-%d-%H-%M-%S"
current_timestamp = strftime(timestamp_format, gmtime())

In [None]:
# Name of the previously run training job whose model artifacts are reused below.
TRAINING_JOB_NAME = 'classifier-2021-04-15-15-54-02-174'  # Copy this from the console
# Timestamped names so that each run creates a new model and a new transform job.
MODEL_NAME = f'clf-xgboost-model-{current_timestamp}'
BATCH_JOB_NAME = f'clf-xgboost-batch-job-{current_timestamp}'

session = boto3.Session()
sagemaker_execution_role = get_execution_role()
sagemaker_session = sagemaker.session.Session()
sagemaker_client = boto3.client('sagemaker', region_name=REGION)
s3_client = boto3.client('s3')

# Resolve the XGBoost container in the same region the SageMaker client targets.
# `session.region_name` can be None (no default region configured) or differ
# from REGION, which would produce a missing or cross-region image URI.
# The image is attached to a model that serves batch inference, so the
# 'inference' scope is requested rather than 'training'.
container_uri = sagemaker.image_uris.retrieve(region=REGION,
                                              framework='xgboost',
                                              version='1.0-1',
                                              image_scope='inference')

### Create a Model object using the name of a previously run training job

In [None]:
# Look up the training job by name and display the raw describe_training_job response.
info = sagemaker_client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)
info

In [None]:
# Pull the S3 location of the model artifacts out of the describe response and display it.
model_artifact_url = info['ModelArtifacts']['S3ModelArtifacts']
model_artifact_url

In [None]:
# Container definition for the model: the container image plus the S3
# location of the trained model artifacts.
primary_container = dict(
    Image=container_uri,
    ModelDataUrl=model_artifact_url,
)

In [None]:
# Create the SageMaker model from the container definition and execution role.
create_model_request = {
    'ModelName': MODEL_NAME,
    'ExecutionRoleArn': sagemaker_execution_role,
    'PrimaryContainer': primary_container,
}
response = sagemaker_client.create_model(**create_model_request)

In [None]:
# Display the value currently bound to `response`.
response

### Create a Batch Transformer for batch inference
Reference: [boto3 SageMaker client documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html)

* `MaxConcurrentTransforms` (integer) --
The maximum number of parallel requests that can be sent to each instance in a transform job. The default value is 1.

* `MaxPayloadInMB` (integer) --
The maximum payload size allowed, in MB. A payload is the data portion of a record (without metadata).

* `BatchStrategy` (string) --
A string that determines the number of records included in a single mini-batch.

`SingleRecord` means only one record is used per mini-batch. `MultiRecord` means a mini-batch contains as many records as can fit within the `MaxPayloadInMB` limit.

In [None]:
# Input side of the job: everything under the `batch_input` S3 prefix,
# declared as uncompressed CSV and split on line boundaries.
transform_input = {
    "DataSource": {
        "S3DataSource": {
            "S3DataType": "S3Prefix",
            "S3Uri": batch_input,
        },
    },
    "ContentType": "text/csv",
    "SplitType": "Line",
    "CompressionType": "None",
}

# Output location and the instances that run the job.
transform_output = {"S3OutputPath": batch_output}
transform_resources = {"InstanceType": "ml.m5.xlarge", "InstanceCount": 2}

# Full set of keyword arguments for create_transform_job.
request = {
    "TransformJobName": BATCH_JOB_NAME,
    "ModelName": MODEL_NAME,
    "BatchStrategy": "MultiRecord",
    "TransformOutput": transform_output,
    "TransformInput": transform_input,
    "TransformResources": transform_resources,
}

In [None]:
# Submit the transform job using the request built above and display the API response.
response = sagemaker_client.create_transform_job(**request)
response

In [None]:
# Poll the transform job every 30 seconds until it reaches a terminal state.
# 'Completed' ends the loop normally; 'Failed' and 'Stopped' raise so that a
# Run All does not continue on to read output that was never produced.
while True:
    response = sagemaker_client.describe_transform_job(TransformJobName=BATCH_JOB_NAME)
    status = response['TransformJobStatus']
    if status == 'Completed':
        print("Transform job ended with status: {}".format(status))
        break
    if status == 'Failed':
        # Use .get so a missing FailureReason cannot mask the real failure with a KeyError.
        message = response.get('FailureReason', 'no failure reason reported')
        print('Transform failed with the following error: {}'.format(message))
        raise RuntimeError('Transform job failed: {}'.format(message))
    if status == 'Stopped':
        # 'Stopped' is also terminal; without this branch the loop would never exit.
        print("Transform job ended with status: {}".format(status))
        raise RuntimeError('Transform job was stopped before completing')
    print("Transform job is still in status: {}".format(status))
    time.sleep(30)

### Evaluate Output

In [None]:
# S3 key of the prediction file under the batch output prefix.
# NOTE(review): presumably the input object is named batch_test.csv, which would
# make the output batch_test.csv.out; confirm against the uploaded input.
key = f'{PREFIX}/batch_test_out/batch_test.csv.out'

In [None]:
# Fetch the prediction file from S3 and parse its streaming body as a
# single-column CSV named 'Predictions'.
obj = s3_client.get_object(Bucket=BUCKET, Key=key)
prediction_stream = obj['Body']
results_df = pd.read_csv(prediction_stream, names=['Predictions'])

In [None]:
# Display the predictions DataFrame.
results_df