# Breast Cancer Prediction

## Load & Prepare Data

In [None]:
import sagemaker
import boto3
import os

In [None]:
# Session used for the SageMaker calls in this notebook, and the IAM
# execution role that is later handed to the estimator.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 location for this notebook's artifacts: the session's default bucket,
# under a project-specific key prefix.
prefix = 'breast-cancer-prediction-xgboost'
bucket = sess.default_bucket()

In [None]:
import pandas as pd

# Load the raw dataset from the local DATA folder into a DataFrame.
raw_df = pd.read_csv('./DATA/data.csv')

In [None]:
# Preview the first rows of the raw data.
raw_df.head()

In [None]:
# (rows, columns) of the raw data.
raw_df.shape

<b>IMPORTANT</b>
<p>1/ Remove column <b>id</b> and binary-encode column <b>diagnosis</b></p>
<p>2/ Bring the target column <i>(y)</i> to the front (first column) if using SageMaker built-in algorithms</p>

In [None]:
# Load the training file for a quick visual check.
# NOTE(review): read_csv treats the first line as column names by default.
# If train.csv is headerless (presumably the case for the CSV fed to the
# built-in algorithm - confirm), pass header=None so the first record is
# not consumed as a header.
train_df = pd.read_csv('./DATA/train.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Upload data files to S3.
# No bucket argument is given, so each file goes to the session's default
# bucket under '<prefix>/<channel>/'. Both batch files share the 'batch'
# key prefix. The value shown below the cell is the return value of the
# last upload call.
sess.upload_data('./DATA/train.csv', key_prefix='{}/train'.format(prefix))
sess.upload_data('./DATA/validation.csv', key_prefix='{}/validation'.format(prefix))
sess.upload_data('./DATA/batch.csv', key_prefix='{}/batch'.format(prefix))
sess.upload_data('./DATA/batch_no_id.csv', key_prefix='{}/batch'.format(prefix))

## Train XGBoost Binary Classifier using SageMaker XGBoost 

The cells below use the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick off the training job using both our training set and validation set. Note that the objective is set to 'binary:logistic', which trains a model to output a probability between 0 and 1 (here, the probability of a tumor being malignant).

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from time import gmtime, strftime

In [None]:
# A UTC timestamp keeps the name distinct from one run to the next.
run_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
job_name = 'xgb-' + run_timestamp

# Per-run S3 output location under the project prefix.
output_location = 's3://{}/{}/output/{}'.format(bucket, prefix, job_name)

# XGBoost container image (version tag 1.0-1) for the current region.
aws_region = boto3.Session().region_name
docker_image_uri = get_image_uri(aws_region, 'xgboost', '1.0-1')

print('Output Location: {}'.format(output_location))
print('Docker Image URI: {}'.format(docker_image_uri))

In [None]:
# Training-cluster and I/O settings, gathered in one place so the
# Estimator call itself stays short.
training_config = {
    'train_instance_count': 2,
    'train_instance_type': 'ml.m5.xlarge',
    'train_volume_size': 50,  # size of the attached storage volume, in GB
    'input_mode': 'File',
    'output_path': output_location,
    'sagemaker_session': sess,
}
estimator = sagemaker.estimator.Estimator(docker_image_uri, role, **training_config)

In [None]:
# XGBoost settings for the training job.
xgb_hyperparameters = {
    'objective': "binary:logistic",  # output a probability in [0, 1]
    'max_depth': 5,                  # maximum depth of each tree
    'eta': 0.2,                      # learning rate
    'gamma': 4,                      # minimum loss reduction required to split
    'min_child_weight': 6,
    'subsample': 0.8,                # fraction of rows sampled per tree
    'silent': 0,                     # 0 = do not suppress log messages
    'num_round': 100,                # number of boosting rounds
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Dump the estimator's instance attributes to review its configuration.
estimator.__dict__

In [None]:
# create data channels
# Both channels read CSV objects found under an S3 prefix with identical
# settings, so the shared options are declared once.
channel_options = {
    'distribution': 'FullyReplicated',
    'content_type': 'text/csv',
    's3_data_type': 'S3Prefix',
}
train_data = sagemaker.session.s3_input(
    's3://{}/{}/train'.format(bucket, prefix), **channel_options)
validation_data = sagemaker.session.s3_input(
    's3://{}/{}/validation'.format(bucket, prefix), **channel_options)

# Channel names map to the inputs the training job will receive.
data_channels = {'train': train_data, 'validation': validation_data}

In [None]:
# start training by calling the fit method in the estimator
# (logs=True shows the training job's log output in this cell)
estimator.fit(inputs=data_channels, logs=True)

## Deploy Trained Model as SageMaker Endpoint

In [None]:
# Host the trained model on a real-time endpoint backed by two ml.m5.xlarge
# instances; the returned predictor is bound to that endpoint.
# NOTE(review): no endpoint deletion step is visible in this notebook - the
# endpoint presumably keeps running (and incurring cost) until removed.
predictor = estimator.deploy(instance_type='ml.m5.xlarge', initial_instance_count=2)

## Test Deployed Model Endpoint for Inference

In [None]:
# Load the batch records and turn them into a list of rows (one Python list
# of values per record).
# NOTE(review): read_csv consumes the first line as a header by default. The
# same file is later sent to Batch Transform as plain CSV, so it is
# presumably headerless - if so, header=None is needed here to keep the
# first record. Confirm against the file.
batch_no_id_df = pd.read_csv('./DATA/batch_no_id.csv')
test_data = batch_no_id_df.values.tolist()

In [None]:
# First record, as a list of values.
test_data[0]

In [None]:
# Serialise the first record as a single comma-separated line; this is the
# request body sent to the endpoint as text/csv.
X = ','.join(map(str, test_data[0]))
X

In [None]:
# Create Sagemaker run-time client using boto3
client = boto3.client('sagemaker-runtime')
# Name of the endpoint the predictor is bound to (used for invoke_endpoint).
endpoint_name = predictor.endpoint

In [None]:
# Send the CSV record to the endpoint, then read the response body and
# decode it as UTF-8 text.
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=X,
)
y = response['Body'].read().decode('utf-8')
y

## Create Batch Transform Job

In [None]:
# S3 object holding the records to score, and the S3 location where the
# batch transform job should write its results.
input_location = 's3://{}/{}/batch/{}'.format(bucket, prefix, 'batch_no_id.csv') 
s3_batch_output = 's3://{}/{}/batch/batch-inference'.format(bucket, prefix)

In [None]:
# Build a batch transformer from the trained estimator; results are written
# to s3_batch_output.
transformer = estimator.transformer(instance_count=2, 
                                    instance_type='ml.m4.xlarge',
                                    output_path=s3_batch_output) 
# Score the CSV input; split_type='Line' treats every line as one record.
transformer.transform(data=input_location, 
                      data_type='S3Prefix', 
                      content_type='text/csv', 
                      split_type='Line')
# Block until the transform job has finished.
transformer.wait()

## Inspect Output of Batch Transform Job
<p>It should show the list of probabilities of tumors being malignant.</p>

In [None]:
from urllib.parse import urlparse
import json
import io

In [None]:
def get_csv_output_from_s3(s3uri, file_name):
    """Download one object from S3 and return its content as text.

    Args:
        s3uri: S3 URI of the location holding the object, e.g.
            ``s3://bucket/some/prefix``. A trailing slash is tolerated and
            the URI may name only the bucket.
        file_name: Name of the object relative to ``s3uri``.

    Returns:
        The object's body decoded as UTF-8.
    """
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    # Drop surrounding slashes so a URI such as 's3://bucket/prefix/' does not
    # yield the key 'prefix//file', and a bucket-root URI does not yield
    # '/file'.
    key_prefix = parsed_url.path.strip('/')
    if key_prefix:
        object_key = '{}/{}'.format(key_prefix, file_name)
    else:
        object_key = file_name
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, object_key)
    return obj.get()["Body"].read().decode('utf-8')

In [None]:
# Fetch the transform result for the input file; the object is looked up as
# '<input file name>.out' under the transformer's output path.
output = get_csv_output_from_s3(transformer.output_path, '{}.out'.format('batch_no_id.csv'))
# Parse the text as CSV; header=None keeps the first line as data.
output_df = pd.read_csv(io.StringIO(output), sep=',', header=None)
output_df.head()

In [None]:
# (rows, columns) of the batch predictions.
output_df.shape