# Script Mode Blog Post

Imports

In [None]:
import sagemaker
import subprocess
import sys
import random
import math
import pandas as pd
import os
import boto3
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sagemaker.pytorch import PyTorch
from sagemaker.xgboost import XGBoost
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.serializers import NumpySerializer, JSONSerializer, CSVSerializer
from sagemaker.deserializers import NumpyDeserializer, JSONDeserializer
from sagemaker.predictor import Predictor

Make sure your SageMaker version is updated.

In [None]:
# SageMaker Python SDK version 2.x is required; this notebook pins release 2.24.1.
original_version = sagemaker.__version__
if original_version != '2.24.1':
    import importlib

    # Install the pinned release with the interpreter that backs this kernel.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'sagemaker==2.24.1']
    )
    # Re-execute the already-imported top-level module after the install.
    # NOTE(review): reload() does not re-import submodules that were already
    # loaded; restarting the kernel is the reliable way to pick up the new SDK.
    importlib.reload(sagemaker)

Session variables

In [None]:
# Seed Python's `random` module for this notebook session.
random.seed(42)

# Useful SageMaker variables
try:
    # You're using a SageMaker notebook
    sess = sagemaker.Session()
    bucket = sess.default_bucket()
    role = sagemaker.get_execution_role()
except ValueError:
    # You're using a notebook somewhere else
    print('Setting role and SageMaker session manually...')
    # NOTE: the bucket, region, profile and role name below are
    # account-specific; replace them with your own values.
    bucket = 'bobby-demo'
    region = 'us-west-2'

    # Configure the default boto3 session BEFORE creating any client.
    # Clients keep the configuration of the session that created them, so
    # clients created earlier would ignore this region and profile.
    boto3.setup_default_session(region_name=region, profile_name='default')

    iam = boto3.client('iam')
    sagemaker_client = boto3.client('sagemaker')

    role = iam.get_role(RoleName='AmazonSageMaker-ExecutionRole-20200630T141851')['Role']['Arn']
    sess = sagemaker.Session(sagemaker_client=sagemaker_client, default_bucket=bucket)

# Local data paths (created under the current working directory)
train_dir = os.path.join(os.getcwd(), 'data/train')
test_dir = os.path.join(os.getcwd(), 'data/test')
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Data paths in S3: "*_prefix" values are object-key prefixes inside `bucket`,
# "*_uri" values are the matching full s3:// locations.
s3_prefix = 'script-mode-workflow'
csv_s3_prefix = f'{s3_prefix}/csv'
csv_s3_uri = f's3://{bucket}/{csv_s3_prefix}'
numpy_train_s3_prefix = f'{s3_prefix}/numpy/train'
numpy_train_s3_uri = f's3://{bucket}/{numpy_train_s3_prefix}'
numpy_test_s3_prefix = f'{s3_prefix}/numpy/test'
numpy_test_s3_uri = f's3://{bucket}/{numpy_test_s3_prefix}'
csv_train_s3_uri = f'{csv_s3_uri}/train'
csv_test_s3_uri = f'{csv_s3_uri}/test'

# Enable Local Mode training ('local' instance type with file:// inputs)
enable_local_mode_training = False

In [None]:
# Download the Local Mode helper script and the daemon.json it ships with
# from the aws-samples repository, then run the script with bash.
# NOTE(review): this cell runs regardless of `enable_local_mode_training`;
# presumably it prepares Docker for SageMaker Local Mode - confirm before
# running it on a machine you do not want reconfigured.
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/local_mode_setup.sh
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/daemon.json    
!/bin/bash ./local_mode_setup.sh

### Prep Boston Housing Data

Load the Boston Housing Data

In [None]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn - confirm the kernel's version.
boston = load_boston()
x, y = boston['data'], boston['target']

# Ordered 80/20 split: the first 80% of rows train, the remainder tests.
training_index = math.floor(.8 * boston['data'].shape[0])
x_train, y_train = x[:training_index], y[:training_index]
x_test, y_test = x[training_index:], y[training_index:]

# Learn the scaling statistics from the training rows only and reuse them for
# the test rows. Fitting a second scaler on the test set would put the two
# sets on different scales and let test statistics leak into preprocessing.
feature_scaler = StandardScaler().fit(x_train)
x_train_np = feature_scaler.transform(x_train)
x_test_np = feature_scaler.transform(x_test)

Convert the data to Pandas

In [None]:
# Convert to Pandas with the target as the first column.
# The features stay unscaled in BOTH frames. The training frame was already
# built from the raw `x_train`, and the scikit-learn and XGBoost endpoints are
# invoked with raw `x_test` rows, so standardizing only the test frame would
# evaluate the models on inputs scaled differently from their training data.
train_df = pd.DataFrame(data=x_train)
train_df.insert(0, 'target', y_train)

test_df = pd.DataFrame(data=x_test)
test_df.insert(0, 'target', y_test)

Save the data in both CSV and NumPy formats to demonstrate data format flexibility in model training.

In [None]:
# Save as Numpy: the standardized feature matrices and the untouched targets
np.save(file=os.path.join(train_dir, 'x_train.npy'), arr=x_train_np)
np.save(file=os.path.join(train_dir, 'y_train.npy'), arr=y_train)
np.save(file=os.path.join(test_dir, 'x_test.npy'), arr=x_test_np)
np.save(file=os.path.join(test_dir, 'y_test.npy'), arr=y_test)

# Save as CSV: no header row and no index column
train_df.to_csv(path_or_buf=f'{train_dir}/train.csv', index=False, header=False)
test_df.to_csv(path_or_buf=f'{test_dir}/test.csv', index=False, header=False)

Upload the data to S3

In [None]:
# Upload to S3
s3_resource_bucket = boto3.Session().resource('s3').Bucket(bucket)

# (S3 key prefix, local directory, file name) for every artifact saved above.
uploads = [
    (csv_s3_prefix, train_dir, 'train.csv'),
    (csv_s3_prefix, test_dir, 'test.csv'),
    (numpy_train_s3_prefix, train_dir, 'x_train.npy'),
    (numpy_train_s3_prefix, train_dir, 'y_train.npy'),
    (numpy_test_s3_prefix, test_dir, 'x_test.npy'),
    (numpy_test_s3_prefix, test_dir, 'y_test.npy'),
]
for key_prefix, local_dir, file_name in uploads:
    # S3 keys always use '/', so build them explicitly; os.path.join would
    # insert backslashes on Windows. The local path comes from the directories
    # the files were written to, so it does not depend on the current
    # working directory at upload time.
    s3_resource_bucket.Object(f'{key_prefix}/{file_name}').upload_file(
        os.path.join(local_dir, file_name)
    )

### Scikit-learn

Script Mode in SageMaker allows you to take control of the training and inference process without having to go through the trouble of creating and maintaining your own docker containers.

In [None]:
# Hyperparameters handed to the training script.
hyperparameters = dict(max_depth=20, n_jobs=4, n_estimators=120)

# Choose where training runs and which CSV channels it reads.
if not enable_local_mode_training:
    train_instance_type = 'ml.c5.xlarge'
    inputs = {
        'train': csv_train_s3_uri,
        'test': csv_test_s3_uri,
    }
else:
    train_instance_type = 'local'
    inputs = {
        'train': f'file://{train_dir}',
        'test': f'file://{test_dir}',
    }

# Script Mode configuration: the entry point lives in `scikitlearn_script/`.
estimator_parameters = dict(
    entry_point='train_deploy_scikitlearn_without_dependencies.py',
    source_dir='scikitlearn_script',
    framework_version='0.23-1',
    py_version='py3',
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    role=role,
    base_job_name='randomforestregressor-model',
)

estimator = SKLearn(**estimator_parameters)
estimator.fit(inputs)

After the estimator finishes training, we can deploy it to a SageMaker endpoint.

In [None]:
# Host the trained model on a single-instance endpoint with a fixed name.
sklearn_predictor = estimator.deploy(
    endpoint_name='randomforestregressor-endpoint',
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

Then we can use the SageMaker endpoint to make predictions.

In [None]:
# Send the whole raw (unscaled) test feature matrix to the endpoint;
# the cell output is the endpoint's response.
sklearn_predictor.predict(x_test)

If you want to come back to this notebook after having already deployed the SageMaker endpoint, you can use the following snippet of code to invoke it.

In [None]:
# Re-attach to the endpoint deployed earlier, exchanging NumPy payloads
# in both directions.
sklearn_predictor = Predictor(
    endpoint_name='randomforestregressor-endpoint',
    serializer=NumpySerializer(),
    deserializer=NumpyDeserializer(),
    sagemaker_session=sess,
)

sklearn_predictor.predict(x_test)

### PyTorch

Sometimes keeping all of your code in one Python file can be unwieldy. Script Mode gives you the flexibility to split your code across multiple Python files.

In this PyTorch example, we want to separate the actual neural network definition from the rest of the code by putting it into its own file as demonstrated in the `pytorch_script/` folder.

In [None]:
# Hyperparameters handed to the training script.
hyperparameters = dict(epochs=5, batch_size=128, learning_rate=0.01)

# Choose where training runs and which NumPy channels it reads.
if not enable_local_mode_training:
    train_instance_type = 'ml.c5.xlarge'
    inputs = {
        'train': numpy_train_s3_uri,
        'test': numpy_test_s3_uri,
    }
else:
    train_instance_type = 'local'
    inputs = {
        'train': f'file://{train_dir}',
        'test': f'file://{test_dir}',
    }

# Script Mode configuration: the entry point lives in `pytorch_script/`.
estimator_parameters = dict(
    entry_point='train_deploy_pytorch_without_dependencies.py',
    source_dir='pytorch_script',
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    role=role,
    base_job_name='pytorch-model',
    framework_version='1.5',
    py_version='py3',
)

estimator = PyTorch(**estimator_parameters)
estimator.fit(inputs)

Again, after the estimator finishes training, we can deploy it to a SageMaker endpoint.

In [None]:
# Host the trained network on a single-instance endpoint with a fixed name.
pytorch_predictor = estimator.deploy(
    endpoint_name='pytorch-endpoint',
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

Then we can use the endpoint to make predictions.

In [None]:
# Exchange JSON with the endpoint in both directions.
pytorch_predictor.serializer = JSONSerializer()
pytorch_predictor.deserializer = JSONDeserializer()

# The network is trained on the standardized arrays (x_train.npy is written
# from `x_train_np`), so send a standardized row rather than a raw `x_test`
# row, which would be on a different scale from the training inputs.
# NOTE(review): assumes the inference script applies no scaling of its own - confirm.
pytorch_predictor.predict(x_test_np[0])

If you want to come back to this notebook after having already deployed the SageMaker endpoint, you can use the following snippet of code to invoke it.

In [None]:
# Re-attach to the endpoint deployed earlier, exchanging JSON payloads
# in both directions.
pytorch_predictor = Predictor(endpoint_name='pytorch-endpoint',
                              sagemaker_session=sess,
                              serializer=JSONSerializer(),
                              deserializer=JSONDeserializer())

# The network is trained on the standardized arrays (x_train.npy is written
# from `x_train_np`), so send a standardized row rather than a raw `x_test`
# row, which would be on a different scale from the training inputs.
# NOTE(review): assumes the inference script applies no scaling of its own - confirm.
pytorch_predictor.predict(x_test_np[0])

### XGBoost

Perhaps the number of Python files you have is becoming unwieldy, or you want more organization. In this scenario, you might be tempted to create your own Python library. The good news is that Script Mode supports custom libraries, and those libraries don't have to be in the same directory as your entry point Python script (SageMaker will copy the library folder to the same folder where the entry point is located).

In this example, we have a custom library to implement k-fold cross validation for an XGBoost model.

In [None]:
# Hyperparameters handed to the training script.
hyperparameters = {'num_round': 6, 'K': 5}

if enable_local_mode_training:
    train_instance_type = 'local'
    inputs = {'train': f'file://{train_dir}'}
else:
    train_instance_type = 'ml.c5.xlarge'
    # Use the train-only prefix so the S3 channel mirrors Local Mode, which
    # reads only `train_dir`. The broader `csv_s3_uri` prefix also matches
    # test.csv, which would mix the held-out rows into the training data.
    inputs = {'train': csv_train_s3_uri}

# Script Mode configuration: `dependencies` ships the custom library folder
# alongside the entry point in `xgboost_script/`.
estimator_parameters = {'entry_point':'train_deploy_xgboost_with_dependencies.py',
                        'source_dir': 'xgboost_script',
                        'dependencies': ['my_custom_library'],
                        'instance_type' : train_instance_type,
                        'instance_count': 1,
                        'hyperparameters': hyperparameters,
                        'role' : role,
                        'base_job_name':'xgboost-model',
                        'framework_version':'1.0-1',
                        'py_version':'py3'}

estimator = XGBoost(**estimator_parameters)
estimator.fit(inputs)

After we train the model with k-fold cross validation, we can deploy it to a SageMaker endpoint.

In [None]:
# Host the trained model on a single-instance endpoint with a fixed name.
xgboost_predictor = estimator.deploy(
    endpoint_name='xgboost-endpoint',
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

Then you can use the endpoint to make predictions.

In [None]:
# Send the request as CSV and parse the response as JSON.
xgboost_predictor.serializer = CSVSerializer()
xgboost_predictor.deserializer = JSONDeserializer()
# Score the first raw test row and show the first element of the response.
xgboost_predictor.predict(x_test[0])[0]

If you want to come back to this notebook after having already deployed the SageMaker endpoint, you can use the following snippet of code to invoke it.

In [None]:
# Re-attach to the endpoint deployed earlier: CSV requests, JSON responses.
xgboost_predictor = Predictor(
    endpoint_name='xgboost-endpoint',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
    sagemaker_session=sess,
)

# Score the first raw test row and show the first element of the response.
xgboost_predictor.predict(x_test[0])[0]

### Cleanup

In [None]:
# Delete all three endpoints, in deployment order, along with their
# endpoint configurations.
for predictor in (sklearn_predictor, pytorch_predictor, xgboost_predictor):
    predictor.delete_endpoint(delete_endpoint_config=True)