#### Imports 

In [1]:
from time import gmtime, strftime, sleep
from sagemaker import AutoML
import pandas as pd
import sagemaker 
import logging
import boto3

#### Set up logger

In [2]:
# Name the logger after the running module (the variable __name__, not the
# literal string '__name__').
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: getLogger returns the same object on
# every call, so re-running this cell would otherwise add another handler and
# print every message multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Essentials

In [3]:
# Take the bucket and the region from the same SageMaker session, so the
# low-level SageMaker client created with `region` talks to the same region as
# the default bucket and the SDK calls that use the default session.
_default_sagemaker_session = sagemaker.Session()
bucket = _default_sagemaker_session.default_bucket()
prefix = 'diabetic-readmission-prediction'
region = _default_sagemaker_session.boto_region_name

In [4]:
# S3 prefix the unlabeled CSV is uploaded to and the transform job reads from.
batch_input = 's3://{}/{}/batch_input/'.format(bucket, prefix)
batch_input

's3://sagemaker-us-east-1-119174016168/diabetic-readmission-prediction/batch_input/'

In [5]:
# S3 prefix named for batch output.
# NOTE(review): no cell visible in this notebook passes this value to the
# transformer or reads from it - confirm whether it is still needed.
batch_output = 's3://{}/{}/batch_output/'.format(bucket, prefix)
batch_output

's3://sagemaker-us-east-1-119174016168/diabetic-readmission-prediction/batch_output/'

In [6]:
# UTC timestamp (year-month-day-hour-minute-second) used to build a unique
# transform job name.
current_timestamp = strftime(
    '%Y-%m-%d-%H-%M-%S',
    gmtime(),
)

In [7]:
# AWS handles for the rest of the notebook: a boto3 session, a SageMaker SDK
# session, an S3 client, and a low-level SageMaker client bound to `region`.
session = boto3.Session()
sagemaker_session = sagemaker.session.Session()
s3_client = boto3.client(service_name='s3')
sagemaker_client = boto3.client(service_name='sagemaker', region_name=region)

In [8]:
# Job name made unique by the timestamp suffix.
transform_job_name = 'autopilot-batch-job-{}'.format(current_timestamp)

#### Copy batch input data from local to S3

In [9]:
# Upload the unlabeled CSV to the batch input prefix with the AWS CLI.
# IPython replaces {batch_input} with the value of the Python variable before
# the shell command runs.
!aws s3 cp .././data/diabetic_readmission_unlabeled.csv {batch_input}

upload: ../data/diabetic_readmission_unlabeled.csv to s3://sagemaker-us-east-1-119174016168/diabetic-readmission-prediction/batch_input/diabetic_readmission_unlabeled.csv


#### Recreate best model from SageMaker Autopilot

In [10]:
# Name of the existing Autopilot job - copy this from the console.
auto_ml_job_name = 'diabetic-readmission-prediction'
# Bind an AutoML object to that job instead of launching a new one.
aml = AutoML.attach(auto_ml_job_name)

In [11]:
# Describe the Autopilot job and pull out its best candidate.
automl_job_description = sagemaker_client.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = automl_job_description["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]

# Log the candidate name together with the objective metric it was chosen on.
objective_metric = best_candidate["FinalAutoMLJobObjectiveMetric"]
logger.info(f"CandidateName: {best_candidate_name}")
logger.info(f'FinalAutoMLJobObjectiveMetricName: {objective_metric["MetricName"]}')
logger.info(f'FinalAutoMLJobObjectiveMetricValue: {objective_metric["Value"]}')

CandidateName: diabetic-readmission-predictioeS-187-52cde9d0
FinalAutoMLJobObjectiveMetricName: validation:macro_f_beta
FinalAutoMLJobObjectiveMetricValue: 0.4098981022834778


In [12]:
# Create a SageMaker model named after the best candidate.
# The two response keys are what the results cell later reads as the
# 'Probabilities' and 'Labels' columns.
# NOTE(review): candidate=None presumably makes the SDK use the job's best
# candidate - confirm against the SDK documentation.
aml_best_model = aml.create_model(
    name=best_candidate_name,
    candidate=None,
    inference_response_keys=['probabilities', 'labels'],
)

#### Create Batch Transform job

In [13]:
# Build a transformer for the model: one ml.m5.xlarge instance, CSV responses,
# output records assembled line by line.
# NOTE(review): no output_path is passed here; the download cell reads the
# result from "<transform job name>/..." in the default bucket.
aml_transformer = aml_best_model.transformer(
    instance_count=1,
    instance_type='ml.m5.xlarge',
    accept='text/csv',
    assemble_with='Line',
)

Using already existing model: diabetic-readmission-predictioeS-187-52cde9d0


In [14]:
# Start the batch transform over everything under the input prefix, one CSV
# line per record. wait=False returns immediately; the next cell polls for
# the job status.
aml_transformer.transform(
    data=batch_input,
    content_type='text/csv',
    split_type='Line',
    job_name=transform_job_name,
    wait=False,
)

#### Check the status of the running Batch Transform job

In [15]:
logger.info("[JobStatus]")

# Poll until the job reaches a terminal state. The status is checked before
# sleeping, so there is no extra 30 s wait once the job has already finished.
terminal_statuses = ("Failed", "Completed", "Stopped")
while True:
    describe_response = sagemaker_client.describe_transform_job(TransformJobName=transform_job_name)
    job_run_status = describe_response["TransformJobStatus"]
    logger.info(job_run_status)
    if job_run_status in terminal_statuses:
        break
    sleep(30)

# Stop here when the job did not complete, instead of letting the download
# cell fail later on a missing S3 object.
if job_run_status != "Completed":
    failure_reason = describe_response.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Transform job {transform_job_name} ended with status {job_run_status}: {failure_reason}"
    )

[JobStatus]
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


#### Download the output of the batch transform job from S3 to local

In [16]:
# Object key of the transform output: "<transform job name>/<input file name>.out".
s3_output_key = "{}/diabetic_readmission_unlabeled.csv.out".format(transform_job_name)
# Local destination for the downloaded predictions.
local_inference_results_path = ".././data/inference_results_with_probabilities.csv"

# Fetch the object from the session's default bucket.
s3 = boto3.resource("s3")
inference_results_bucket = s3.Bucket(sagemaker_session.default_bucket())
inference_results_bucket.download_file(
    Key=s3_output_key,
    Filename=local_inference_results_path,
)

#### Inspect the results (prediction labels and probabilities)

In [17]:
# Show at most 10 rows when the frame is displayed.
pd.set_option("display.max_rows", 10)

# The output file has no header row, so the two columns are named here.
data = pd.read_csv(
    local_inference_results_path,
    names=['Probabilities', 'Labels'],
    sep=",",
)
data

Unnamed: 0,Probabilities,Labels
0,"[0.18437457084655762, 0.33086180686950684, 0.4...","['<30', '>30', 'no']"
1,"[0.1948574036359787, 0.293072372674942, 0.5120...","['<30', '>30', 'no']"
2,"[0.27881544828414917, 0.4338899850845337, 0.28...","['<30', '>30', 'no']"
3,"[0.297400563955307, 0.328758180141449, 0.37384...","['<30', '>30', 'no']"
4,"[0.3162025511264801, 0.3914549946784973, 0.292...","['<30', '>30', 'no']"
