## Launching a training job with the Python SDK

In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

# Session object handed to the estimator further down in this notebook.
sagemaker_session = sagemaker.Session()

# IAM role returned by the SDK helper; the estimator below receives it as `role`.
role = sagemaker.get_execution_role()

In [3]:
# --- S3 locations and encryption settings ---
s3_bucket = 'readmission-data'
s3_key_prefix = 'processed-data'        # prefix the training channel reads from
s3_kms_id = '3a90a5d2-2ba8-4942-b9df-9a27ff7bf412'  # KMS key id applied to the job output
s3_output_key_prefix = 'models'         # prefix the job output is written under
s3_train_data = 's3://{}/{}/'.format(s3_bucket, s3_key_prefix)
s3_output_data = 's3://{}/{}/'.format(s3_bucket, s3_output_key_prefix)

print('training data location:', s3_train_data)
print('model output location', s3_output_data)

# --- Estimator definition ---
# NOTE(review): 'ml.p3.2xlarge' is a GPU instance type; confirm that this is
# intended for a scikit-learn training script and accepted by the SDK version in use.
sklearn_model = SKLearn(entry_point='sklearn-train.py',
                        source_dir='train_scripts',
                        train_instance_type='ml.p3.2xlarge',
                        train_instance_count=1,
                        train_volume_size=20,
                        framework_version='0.20.0',
                        role=role,
                        input_mode='File',
                        sagemaker_session=sagemaker_session,
                        output_path=s3_output_data,
                        # Reuse the key id declared above instead of repeating the
                        # literal, so the key only has to be changed in one place.
                        output_kms_key=s3_kms_id,
                        encrypt_inter_container_traffic=True,
                        # Forwarded to the training script; presumably the number of
                        # trees in the random forest -- verify against sklearn-train.py.
                        hyperparameters={'estimators': 20})

# Launch the training job with the processed data bound to the 'train' channel.
sklearn_model.fit({'train': s3_train_data})

training data location: s3://readmission-data/processed-data/
model output location s3://readmission-data/models/
2019-10-30 00:45:21 Starting - Starting the training job...
2019-10-30 00:45:33 Starting - Launching requested ML instances......
2019-10-30 00:46:32 Starting - Preparing the instances for training...
2019-10-30 00:47:18 Downloading - Downloading input data...............
2019-10-30 00:49:48 Training - Downloading the training image...
2019-10-30 00:50:07 Training - Training image download completed. Training in progress.[31m2019-10-30 00:50:07,117 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[31m2019-10-30 00:50:07,141 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[31m2019-10-30 00:50:07,472 sagemaker-containers INFO     Module sklearn-train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-10-30 00:50:07,472 sagemaker-containers INFO     Generating setup.cfg[0m
[31m

[31mTraining input load complete[0m
[31mTest prevalence(n = 2294293): 0.08100185983220103[0m
[31mValid prevalence(n = 2294293): 0.08097788730558826[0m
[31mTrain all prevalence(n = 10706699): 0.08127257523537366[0m
[31mall samples (n = 15295285)[0m
[31mTrain prevalence (n = 1740322): 0.5[0m
[31mstarting to fit the model[0m
[31mModel fit complete[0m
[31mRF Accuracy 0.8566090209290611[0m
[31mroc score 0.8566192522759817[0m
[31mClassification Report[0m
              precision    recall  f1-score   support

           0       0.87      0.84      0.85    261187
           1       0.84      0.88      0.86    260910

   micro avg       0.86      0.86      0.86    522097
   macro avg       0.86      0.86      0.86    522097[0m
[31mweighted avg       0.86      0.86      0.86    522097
[0m
[31mConfusion matrix, without normalization[0m
[31m[[218701  42486]
 [ 32378 228532]][0m

2019-10-30 01:01:53 Uploading - Uploading generated training model[31mAverage precision-re


2019-10-30 01:02:38 Completed - Training job completed
Training seconds: 920
Billable seconds: 920


### Launching a tuning job with the Python SDK

In [6]:
# we use the Hyperparameter Tuner
from sagemaker.tuner import IntegerParameter

# Define exploration boundaries for the 'estimators' hyperparameter
hyperparameter_ranges = {
    'estimators': IntegerParameter(18, 20)}

# create Optimizer: a tuner wrapped around the estimator defined above
Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn_model,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name='RF-tuner',
    objective_type='Maximize',
    objective_metric_name='RF Accuracy',
    metric_definitions=[
        {'Name': 'RF Accuracy',
         # Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
         # A plain string literal here triggers an invalid-escape-sequence warning.
         # The capture group extracts the value from the "RF Accuracy <value>" log line.
         'Regex': r'RF Accuracy (\d\.\d+)'}],
    max_jobs=2,
    max_parallel_jobs=2)

In [7]:
# Start the hyperparameter tuning job, feeding the same S3 location used for
# the single training job to the 'train' channel.
Optimizer.fit(inputs=dict(train=s3_train_data))

In [9]:
# Query the SageMaker API for the current status of the tuning job launched above.
import boto3
from sagemaker.tuner import HyperparameterTuner

sm_client = boto3.client('sagemaker')
tuning_job_name = Optimizer.latest_tuning_job.job_name
tuning_job_description = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name)

# Last expression of the cell, so the status string is displayed as the cell output.
tuning_job_description['HyperParameterTuningJobStatus']

'InProgress'

In [10]:
# Collect the tuning job results as a DataFrame and preview the first rows.
tuner_analytics = Optimizer.analytics()
results = tuner_analytics.dataframe()
results.head()

Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,estimators
0,0.85645,808.0,2019-10-30 01:21:15+00:00,RF-tuner-191030-0105-002-473b24c9,Completed,2019-10-30 01:07:47+00:00,18.0
1,0.855464,812.0,2019-10-30 01:21:15+00:00,RF-tuner-191030-0105-001-91f328f7,Completed,2019-10-30 01:07:43+00:00,19.0


In [None]:
## Creating a model
# Builds a SageMaker model object from a model archive stored in S3.
# NOTE(review): this cell uses MXNetModel, while the estimator trained earlier in
# this notebook is SKLearn -- confirm which model class is intended.
# NOTE(review): the model_data, role and entry_point values below look like
# documentation placeholders; TODO replace them with real values before running.
from sagemaker.mxnet.model import MXNetModel

sagemaker_model = MXNetModel(model_data='s3://path/to/model.tar.gz',
                             role='arn:aws:iam::accid:sagemaker-role',
                             entry_point='entry_point.py')

In [None]:
# Deploy my estimator to a SageMaker Endpoint and get a Predictor
#training_job_name = sklearn_estimator.latest_training_job.name
#training_job_name = "sagemaker-scikit-learn-2019-10-05-07-30-22-718"
#attached_estimator = SKLearn.attach(training_job_name)
#predictor = attached_estimator.deploy(instance_type='ml.m4.xlarge',initial_instance_count=1)

### Batch transform training data

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
# NOTE(review): `sklearn_preprocessor` is not defined in the visible part of this
# notebook (the estimator created above is named `sklearn_model`); on a fresh
# kernel this cell would raise NameError unless it is defined elsewhere -- confirm.
transformer = sklearn_preprocessor.transformer(
    instance_count=1, 
    instance_type='ml.m4.xlarge',
    assemble_with = 'Line',
    accept = 'text/csv')

# Preprocess training input
# NOTE(review): `train_input` is likewise not defined in the visible cells;
# presumably an S3 URI such as `s3_train_data` -- verify before running.
transformer.transform(train_input, content_type='text/csv')
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
# Block until the transform job finishes, then keep the location of its output.
transformer.wait()
preprocessed_train = transformer.output_path