## Run batch predictions using the best model trained by SageMaker Autopilot

### Imports 

In [1]:
from sagemaker import get_execution_role
from time import gmtime, strftime
from sagemaker import AutoML
import pandas as pd
import sagemaker
import logging
import boto3
import time

#### Setup logger

In [2]:
# Name the logger after the running module. The original passed the literal
# string '__name__', which created a logger that is literally called "__name__".
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler and print every log line multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [3]:
# Record which library versions this notebook ran with, one log line each.
for library_label, library_module in (('SageMaker', sagemaker), ('Boto3', boto3), ('Pandas', pd)):
    logger.info(f'Using {library_label} version: {library_module.__version__}')

Using SageMaker version: 2.70.0
Using Boto3 version: 1.20.23
Using Pandas version: 1.0.1


### Setup

#### Assign S3 location to park our datasets

In [4]:
# Use a single SageMaker session for both the bucket and the region so the two
# can never disagree.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
# Key prefix under which this notebook's S3 locations are built.
prefix = 'customer-churn-prediction'
# Take the region from the session instead of hard-coding 'us-east-1'; a
# hard-coded value is wrong for anyone running in a different region.
region = sagemaker_session.boto_region_name

#### Set your batch input and output S3 locations

In [5]:
# S3 location holding the batch input, built from the bucket and prefix.
batch_input = 's3://{}/{}/batch_input/'.format(bucket, prefix)
logger.info('Batch input S3 location: %s', batch_input)

Batch input S3 location: s3://sagemaker-us-east-1-119174016168/customer-churn-prediction/batch_input/


In [6]:
# S3 location for the batch output, built from the bucket and prefix.
batch_output = 's3://{}/{}/batch_output/'.format(bucket, prefix)
logger.info('Batch output S3 location: %s', batch_output)

Batch output S3 location: s3://sagemaker-us-east-1-119174016168/customer-churn-prediction/batch_output/


#### Create clients and resources 

In [7]:
# One explicit boto3 session backs every AWS client/resource in this notebook.
session = boto3.Session()
# Region comes from the session's configuration (environment / AWS config).
region = session.region_name
# Low-level SageMaker client, used to poll the transform job status.
sagemaker_client = session.client('sagemaker', region_name=region)
# Build the S3 resource from the same session (the original used the implicit
# default session) so both share identical credentials and region settings.
s3 = session.resource('s3', region_name=region)

### Interact with your previously run Autopilot Experiment 

#### Create automl instance using previously created Autopilot experiment name

In [8]:
# Name of a previously run Autopilot job. Replace the value below with your own, e.g.:
# autopilot_experiment_name = '<ENTER YOUR AUTOPILOT EXPERIMENT NAME HERE>'
autopilot_experiment_name = 'churn-prediction-experiment'
# Attach to that job by name; the resulting object is used to describe the job
# and to create a model from its best candidate.
automl = AutoML.attach(auto_ml_job_name=autopilot_experiment_name)

#### Define inference response keys


Valid inference response keys can be set based on the problem type - binary classification or multiclass classification.

* `predicted_label` - Predicted class 

* `probability` - In binary classification, the probability that the result is predicted as the second or True class in the target column. In multiclass classification, the probability of the winning class.
    
* `labels` - List of all possible classes 

* `probabilities` - List of all probabilities for all classes (order corresponds with 'labels')

In [9]:
# Fields requested from the model for every record: the predicted class and its probability.
# NOTE(review): the batch output is later read as two columns in this same order —
# confirm that the response column order follows the order of this list.
inference_response_keys = ['predicted_label', 'probability']

#### Get the best candidate model details using the AutoML instance created above

In [10]:
# Fetch the job description once, then pick out the best candidate and its name.
automl_job_description = automl.describe_auto_ml_job()
best_candidate = automl_job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
logger.info('Best candidate name: %s', best_candidate_name)

Best candidate name: churn-prediction-experimentGqPsk-126-3e37f694


In [11]:
# Log the name and value of the best candidate's final objective metric.
objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']
logger.info(f"Objective metric name: {objective_metric['MetricName']}")
logger.info(f"Objective metric value: {objective_metric['Value']}")

Objective metric name: validation:binary_f_beta
Objective metric value: 0.9682851433753967


#### Recreate the best model using the identified candidate name 

In [12]:
# Create a model object from the best candidate, named after that candidate and
# configured to return the fields listed in inference_response_keys.
model = automl.create_model(
    name=best_candidate_name,
    candidate=best_candidate,
    inference_response_keys=inference_response_keys,
)

### Make batch predictions using the Autopilot model

#### Create a transformer using the above re-created model to run batch predictions 

In [13]:
# Settings for the batch transformer: one ml.m5.xlarge instance, results
# assembled line by line and written under the batch output location.
transformer_settings = dict(
    instance_count=1,
    instance_type='ml.m5.xlarge',
    assemble_with='Line',
    output_path=batch_output,
)
transformer = model.transformer(**transformer_settings)

Using already existing model: churn-prediction-experimentGqPsk-126-3e37f694


#### Kick off the batch predictions job

In [14]:
# Start the batch transform over the CSV input, splitting it line by line.
# wait=False returns immediately; the job status is polled separately.
transformer.transform(
    data=batch_input,
    split_type='Line',
    content_type='text/csv',
    wait=False,
)
# Read the job name through the public `latest_transform_job` handle rather
# than the private `_current_job_name` attribute, which is an SDK internal.
transformer_current_job_name = transformer.latest_transform_job.name

In [15]:
# Report the name of the batch transform job that was just started.
logger.info('Running batch predictions job using SageMaker Batch Transform: %s', transformer_current_job_name)

Running batch predictions job using SageMaker Batch Transform: churn-prediction-experimentGqPsk-126-3e-2022-02-23-17-53-25-199


#### Check the status of the running job

In [16]:
# Statuses after which the transform job will not change any further.
TERMINAL_STATUSES = ('Failed', 'Completed', 'Stopped')
# Seconds to wait between two consecutive status checks.
POLL_INTERVAL_SECONDS = 30

# Poll until the job reaches a terminal status. Each poll is logged exactly
# once, and the sleep only happens when another poll is actually needed.
while True:
    response = sagemaker_client.describe_transform_job(TransformJobName=transformer_current_job_name)
    status = response['TransformJobStatus']
    logger.info(f'Job status: {status}')
    if status in TERMINAL_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Stop here when the job did not complete: the cells that follow download the
# job's output, which does not exist for a failed or stopped job.
if status != 'Completed':
    failure_reason = response.get('FailureReason', 'no failure reason reported')
    raise RuntimeError(
        f'Batch transform job {transformer_current_job_name} ended with status {status}: {failure_reason}'
    )

Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: InProgress
Job status: Completed


### Inspect the results of the batch job

In [17]:
# Local path the batch output file is saved to.
local_file_save_path = './results.csv'
# Object key of the output file under the batch_output prefix.
s3_output_key = '/'.join((prefix, 'batch_output', 'unlabeled.csv.out'))
# Download that object from the bucket to the local path.
results_bucket = s3.Bucket(bucket)
results_bucket.download_file(Key=s3_output_key, Filename=local_file_save_path)

In [18]:
# The downloaded file is read as header-less CSV, so name its two columns explicitly.
result_columns = ['predicted_label', 'predicted_probability']
results_df = pd.read_csv(local_file_save_path, header=None, names=result_columns)
results_df

Unnamed: 0,predicted_label,predicted_probability
0,False.,0.000174
1,False.,0.001264
2,True.,0.997513
3,False.,0.000998
4,True.,0.999312
...,...,...
495,True.,0.999278
496,True.,0.997822
497,False.,0.000278
498,True.,0.914709
