Please choose the Python 3 (Data Science) kernel to proceed.

Install Packages

In [None]:
#Install sagemaker_studio_image_build utility
import sys

!{sys.executable} -m pip install sagemaker_studio_image_build

The Dockerfile

The Dockerfile describes the image that we want to build. You can think of it as describing the complete operating system installation of the system that you want to run. A running Docker container is quite a bit lighter than a full operating system, however, because it takes advantage of Linux on the host machine for the basic operations.

For this demonstration, we will use a python-slim base container, add the code that implements our specific inference logic to it, and set up the right environment for it to run under.

In [None]:
!cd InferenceContainer
!cat Dockerfile

Building and Registering the container

In [None]:
%%sh
# Build the image from the Dockerfile in the current directory with sm-docker and
# register it as legacycode:latest (the processing-job cell below references this
# repository and tag when it assembles the ECR image URI).

sm-docker build . --repository legacycode:latest

In [None]:
import sagemaker
import boto3

# SageMaker session: single source for the bucket, region and AWS credentials used below
session = sagemaker.Session()

# Default S3 bucket of the session; all job inputs and outputs live under it
default_bucket = session.default_bucket()

# Region of the SageMaker session. Taking it from the same session that supplies the
# bucket and the STS client keeps the three values consistent, instead of consulting a
# second, independently configured boto3.Session().
region = session.boto_region_name

# AWS account ID of the caller (needed to assemble the ECR image URI later)
account = session.boto_session.client('sts').get_caller_identity()['Account']

# SageMaker execution role passed to every processing job
role = sagemaker.get_execution_role()

# Upload the input data and scripts into the S3 bucket
S3_prefix = "legacycode2"

scripts_directory = "../scripts"
scripts_location = session.upload_data(
    scripts_directory,
    key_prefix=S3_prefix + "/scripts",
)

data_directory = "../data"
input_location = session.upload_data(
    data_directory,
    key_prefix=S3_prefix + "/data/preproc/input",
)

In [None]:
from sagemaker import image_uris

# Resolve the URI of the pre-built scikit-learn 0.23-1 image for the session's region;
# the pre- and post-processing jobs below run their scripts inside this image.
sklearn_image_uri = image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1',
    image_scope='training',
)
print(sklearn_image_uri)

In [None]:
## Create pre-processing job in script mode using the pre-built scikit-learn container
import os
import json
import boto3
import time

sm = boto3.client('sagemaker')

# Define parameters
instance_type = "ml.m5.xlarge"
volume_size = 20    # EBS volume size in GB
max_runtime = 3600  # Default: 1h

# Timestamp suffix keeps the job name unique across runs
timestamp = time.strftime('%Y%m%d-%H%M%S')
job_name = f'sm-preprocessingjob-{timestamp}'

# Define inputs/outputs:
#   - raw data and the scripts folder are downloaded from S3 into the container
#   - whatever the script writes to /opt/ml/processing/output is uploaded at end of job
create_preprocessing_params = {
    "ProcessingInputs": [
            {
                'InputName': 'input_data',
                'S3Input': {
                    'S3Uri': "s3://{}/{}/data/preproc/input".format(default_bucket, S3_prefix),
                    'LocalPath': '/opt/ml/processing/input/data/',
                    'S3DataType': 'S3Prefix',
                    'S3InputMode': 'File'
                }
            },
            {
                'InputName': 'scripts',
                'S3Input': {
                    'S3Uri': "s3://{}/{}/scripts".format(default_bucket,S3_prefix),
                    'LocalPath': '/opt/ml/processing/input/scripts/',
                    'S3DataType': 'S3Prefix',
                    'S3InputMode': 'File'
                }
            }
    ],
    "ProcessingOutputConfig": {
        'Outputs': [
            {
                'OutputName': 'output_data',
                'S3Output': {
                    'S3Uri': "s3://{}/{}/data/predict/input".format(default_bucket,S3_prefix),
                    'LocalPath': '/opt/ml/processing/output',
                    'S3UploadMode': 'EndOfJob'
                }
            }
        ]
    },
    "ProcessingJobName": job_name,
    "ProcessingResources": {
        'ClusterConfig': {
            'InstanceCount': 1,
            'InstanceType': instance_type,
            'VolumeSizeInGB': volume_size
        }
    },
    "StoppingCondition": {
        'MaxRuntimeInSeconds': max_runtime
    },
    "AppSpecification": {
        'ImageUri': sklearn_image_uri,
        'ContainerEntrypoint': ['python',"/opt/ml/processing/input/scripts/preprocess.py"]
    },
    "RoleArn": role
}

# Create processing job and show its ARN
response = sm.create_processing_job(**create_preprocessing_params)
print(response['ProcessingJobArn'])

# create_processing_job only submits the job. The following cell's job reads
# .../data/predict/input, which this job writes at end of job, so block here until
# the job has finished; otherwise "Run All" would start the next stage on missing data.
sm.get_waiter('processing_job_completed_or_stopped').wait(
    ProcessingJobName=job_name,
    # Poll every 30 s for the allowed runtime plus ~20 min of provisioning margin
    WaiterConfig={'Delay': 30, 'MaxAttempts': max_runtime // 30 + 40},
)

# The waiter also returns for a stopped job, so verify the final status explicitly
job_status = sm.describe_processing_job(ProcessingJobName=job_name)['ProcessingJobStatus']
if job_status != 'Completed':
    raise RuntimeError(f"Processing job {job_name} ended with status {job_status}")
job_status

In [None]:
## Create processing job using the custom container built in the cell above
import os
import json
import boto3
import time
from sagemaker import get_execution_role

sm = boto3.client('sagemaker')


# Get parameters
# ECR URI of the image registered earlier as legacycode:latest
image_uri = '{}.dkr.ecr.{}.amazonaws.com/legacycode:latest'.format(account, region)
instance_type = "ml.m5.xlarge"
volume_size = 20    # EBS volume size in GB
max_runtime = 3600  # Default: 1h
entrypoint = "/opt/ml/code/predict.py"  # script path inside the custom image

# Timestamp suffix keeps the job name unique across runs
timestamp = time.strftime('%Y%m%d-%H%M%S')
job_name = f'sm-processing-job-{timestamp}'

# Define inputs/outputs:
#   - input is the prefix the pre-processing job writes to
#   - output is the prefix the post-processing job reads from
create_processing_params = {
    "ProcessingInputs": [
            {
                'InputName': 'input_data',
                'S3Input': {
                    'S3Uri': "s3://{}/{}/data/predict/input".format(default_bucket, S3_prefix),
                    'LocalPath': '/opt/ml/processing/input',
                    'S3DataType': 'S3Prefix',
                    'S3InputMode': 'File'
                }
            }
    ],
    "ProcessingOutputConfig": {
        'Outputs': [
            {
                'OutputName': 'output_data',
                'S3Output': {
                    'S3Uri': "s3://{}/{}/data/postproc/input".format(default_bucket, S3_prefix),
                    'LocalPath': '/opt/ml/processing/output',
                    'S3UploadMode': 'EndOfJob'
                }
            }
        ]
    },
    "ProcessingJobName": job_name,
    "ProcessingResources": {
        'ClusterConfig': {
            'InstanceCount': 1,
            'InstanceType': instance_type,
            'VolumeSizeInGB': volume_size
        }
    },
    "StoppingCondition": {
        'MaxRuntimeInSeconds': max_runtime
    },
    "AppSpecification": {
        'ImageUri': image_uri,
        'ContainerEntrypoint': ['python', entrypoint]
    },
    "RoleArn": role
}

# Create processing job and show its ARN
response = sm.create_processing_job(**create_processing_params)
print(response['ProcessingJobArn'])

# create_processing_job only submits the job. The post-processing cell reads
# .../data/postproc/input, which this job writes at end of job, so block here until
# the job has finished; otherwise "Run All" would start the next stage on missing data.
sm.get_waiter('processing_job_completed_or_stopped').wait(
    ProcessingJobName=job_name,
    # Poll every 30 s for the allowed runtime plus ~20 min of provisioning margin
    WaiterConfig={'Delay': 30, 'MaxAttempts': max_runtime // 30 + 40},
)

# The waiter also returns for a stopped job, so verify the final status explicitly
job_status = sm.describe_processing_job(ProcessingJobName=job_name)['ProcessingJobStatus']
if job_status != 'Completed':
    raise RuntimeError(f"Processing job {job_name} ended with status {job_status}")
job_status

In [None]:
## Create post-processing job in script mode using the pre-built scikit-learn container
import os
import json
import boto3
import time
from sagemaker import get_execution_role

sm = boto3.client('sagemaker')

# Define parameters
instance_type = "ml.m5.xlarge"
volume_size = 20    # EBS volume size in GB
max_runtime = 3600  # Default: 1h

# Timestamp suffix keeps the job name unique across runs
timestamp = time.strftime('%Y%m%d-%H%M%S')
job_name = f'sm-postprocessing-job-{timestamp}'

# Define inputs/outputs:
#   - input is the prefix the prediction job writes to, plus the scripts folder
#   - final results are uploaded to .../data/postproc/output at end of job
create_postprocessing_params = {
    "ProcessingInputs": [
            {
                'InputName': 'input_data',
                'S3Input': {
                    'S3Uri': "s3://{}/{}/data/postproc/input".format(default_bucket, S3_prefix),
                    'LocalPath': '/opt/ml/processing/input/data/',
                    'S3DataType': 'S3Prefix',
                    'S3InputMode': 'File'
                }
            },
            {
                'InputName': 'scripts',
                'S3Input': {
                    'S3Uri': "s3://{}/{}/scripts".format(default_bucket,S3_prefix),
                    'LocalPath': '/opt/ml/processing/input/scripts/',
                    'S3DataType': 'S3Prefix',
                    'S3InputMode': 'File'
                }
            }
    ],
    "ProcessingOutputConfig": {
        'Outputs': [
            {
                'OutputName': 'output_data',
                'S3Output': {
                    'S3Uri': "s3://{}/{}/data/postproc/output".format(default_bucket, S3_prefix),
                    'LocalPath': '/opt/ml/processing/output',
                    'S3UploadMode': 'EndOfJob'
                }
            }
        ]
    },
    "ProcessingJobName": job_name,
    "ProcessingResources": {
        'ClusterConfig': {
            'InstanceCount': 1,
            'InstanceType': instance_type,
            'VolumeSizeInGB': volume_size
        }
    },
    "StoppingCondition": {
        'MaxRuntimeInSeconds': max_runtime
    },
    "AppSpecification": {
        'ImageUri': sklearn_image_uri,
        'ContainerEntrypoint': ['python'],
        # Run the post-processing script; this cell previously pointed at preprocess.py,
        # which re-ran the pre-processing stage on the prediction output.
        # NOTE(review): assumes ../scripts contains postprocess.py - confirm the file name.
        "ContainerArguments": [
          "/opt/ml/processing/input/scripts/postprocess.py"
        ]
    },
    "RoleArn": role
}
# Create processing job and return job ARN
sm.create_processing_job(**create_postprocessing_params)

In [None]:
#Install requirements locally
!{sys.executable} -m pip install -r src/requirements.txt

In [None]:
#Test locally
!python /root/Blog_LegacyCode_MLOps/Template/InferenceContainer/src/predict.py local /root/Blog_LegacyCode_MLOps/Template/data/