# Analyze Heart Disease with Amazon SageMaker XGBoost
---

In [149]:
# cell 00 .. install and setup dependent libraries and SDKs for Jupyter Notebook
# Installs the SageMaker Experiments SDK; it supplies the `smexperiments` package imported in cell 06.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): the version is not pinned, so a re-run may pull a newer release — consider pinning.
%pip install sagemaker-experiments

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [151]:
# cell 01 ... shared sessions, clients and naming constants used by the later cells
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session, its default bucket, and the execution role of this notebook
session = sagemaker.Session()
s3_bucket = session.default_bucket()
role = get_execution_role()

# key prefix for everything written to S3, and name prefix for experiment resources
s3_prefix = 'sagemaker/heartdisease/xgboost'
experiment_prefix = 'sm-heart-xgb-exp'

# low-level AWS handles, all built from one boto3 session
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)
s3 = boto_session.resource('s3')

print(s3_bucket)

sagemaker-us-east-2-645411899653


In [152]:
# cell 02 ... ETL
# Object key of the prepared CSV inside the default bucket, and the local file it is copied to.
# NOTE(review): the key looks like a one-off Data Wrangler export — confirm it exists in your account.
s3_remote_path = 'export-flow-01-21-03-50-29d28f75/output/data-wrangler-flow-processing-01-21-03-50-29d28f75/f7636a34-c66f-4297-b046-014d74e546db/default/part-00000-fc4d306a-9bd5-461f-8b5e-8c9124538d13-c000.csv'
sm_local_path = 'heartfailure_data_pakistan_full.csv'

# copy the remote object to the notebook's local disk
source_bucket = s3.Bucket(s3_bucket)
source_bucket.download_file(Key=s3_remote_path, Filename=sm_local_path)

Read the data into a Pandas data frame and take a look.

In [153]:
# cell 03 ... preview input data frame
import pandas as pd

# display limits: wide enough to see all of the columns, short enough to keep the output on one page
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

# load the downloaded CSV; the bare name on the last line renders the frame as the cell output
data = pd.read_csv(sm_local_path)
data

Unnamed: 0,target_heartfailure,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking
0,1,75,0,582,0,20,1,265000,1.9,130,1,0
1,1,55,0,7861,0,38,0,263358,1.1,136,1,0
2,1,65,0,146,0,20,0,162000,1.3,129,1,1
3,1,50,1,111,0,20,0,210000,1.9,137,1,0
4,1,65,1,160,1,20,0,327000,2.7,116,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
294,0,62,0,61,1,38,1,155000,1.1,143,1,1
295,0,55,0,1820,0,38,0,270000,1.2,139,0,0
296,0,45,0,2060,1,60,0,742000,0.8,138,0,0
297,0,45,0,2413,0,38,0,140000,1.4,140,1,1


Note that there are 11 features to help predict the target column 'target_heartfailure'.

Amazon SageMaker Autopilot takes care of preprocessing your data for you. You do not need to perform conventional data preprocessing techniques such as handling missing values, converting categorical features to numeric features, scaling data, and handling more complicated data types.

Moreover, splitting the dataset into training and validation splits is not necessary. Autopilot takes care of this for you. You may, however, want to split out a test set. That's next, although you use it for batch inference at the end instead of testing the model.


### Reserve some data for calling batch inference on the model

Divide the data into training and testing splits. The training split is used to train the candidate models in the SageMaker XGBoost hyperparameter tuning job. The testing split is uploaded separately and supplied to the tuning job as its validation channel.


In [154]:
# cell 04 ... split data set into training and test subsets
# 80% of the rows, drawn with a fixed seed, form the training subset
train_data = data.sample(random_state=200, frac=0.8)

# every row whose index label was not drawn above goes to the test subset
held_out_rows = ~data.index.isin(train_data.index)
test_data = data[held_out_rows]

# todo ... split 70 / 15 / 15 so that the last subset is for inference
#          (that inference subset would then drop the 'target_heartfailure' column)

### Upload the dataset to Amazon S3
Copy the file to Amazon Simple Storage Service (Amazon S3) in a .csv format for Amazon SageMaker training to use.

In [155]:
# cell 05 .. publish training and test subsets of data to S3
# Each subset is written locally without header or index column, then uploaded under its own key prefix.

# training subset
train_file = 'train_data.csv'
train_data.to_csv(train_file, header=False, index=False)
train_key_prefix = s3_prefix + "/train"
train_data_s3_path = session.upload_data(path=train_file, key_prefix=train_key_prefix)
print('Train data uploaded to: ' + train_data_s3_path)

# test subset
test_file = 'test_data.csv'
test_data.to_csv(test_file, header=False, index=False)
test_key_prefix = s3_prefix + "/test"
test_data_s3_path = session.upload_data(path=test_file, key_prefix=test_key_prefix)
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-2-645411899653/sagemaker/heartdisease/xgboost/train/train_data.csv
Test data uploaded to: s3://sagemaker-us-east-2-645411899653/sagemaker/heartdisease/xgboost/test/test_data.csv


## Launching the SageMaker HyperParameter Tuning Job<a name="Launching"></a>

In [156]:
# cell 06 ... create parent experiment to associate with HPO tuning job

import time
from time import strftime

import smexperiments
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# One timestamp is shared by the experiment and the trial name so the pair is easy to match up.
create_date = strftime("%Y-%m-%d-%H-%M-%S")
experiment_name = 'sm-heart-xgb-exp-{}'.format(create_date)
trial_name = 'sm-heart-xgb-trial-{}'.format(create_date)

# experiment: load it if it already exists, otherwise create it
try:
    experiment = Experiment.load(experiment_name = experiment_name)
except Exception as ex:
    # Only a missing resource means "create it". Any other failure is re-raised
    # rather than swallowed, which would leave `experiment` undefined or stale.
    if "ResourceNotFound" not in str(ex):
        raise
    experiment = Experiment.create(experiment_name = experiment_name,
                                   description = "SageMaker XGBoost experiment",
                                   tags = [{'Key': 'Experiment', 'Value': experiment_name}])

# trial: same load-or-create pattern, attached to the experiment above
try:
    trial = Trial.load(trial_name)
except Exception as ex:
    if "ResourceNotFound" not in str(ex):
        raise
    trial = Trial.create(experiment_name=experiment_name, trial_name=trial_name)

print(experiment_name)
print(trial_name)

sm-heart-xgb-exp-2021-11-05-17-21-07
sm-heart-xgb-trial-2021-11-05-17-21-07


In [157]:
# cell 07 ... estimator, data channels and objective metric for the tuning job
# get algo container [class]
container = sagemaker.image_uris.retrieve(framework='xgboost',
                                          region=boto3.Session().region_name,
                                          version='latest')

# tags tying the training resources to the experiment and trial, and the S3 location for model output
resource_tags = [{'Key':'Experiment','Value':experiment_name},{'Key':'Trial','Value':trial_name}]
model_output_path = 's3://{}/{}/output'.format(s3_bucket, s3_prefix)

# setup estimator for algo [tbt running instance of algo class]
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    sagemaker_session=session,
    tags=resource_tags,
)

# setup training and test/validation channels
train_channel_uri = 's3://{}/{}/train'.format(s3_bucket, s3_prefix)
validation_channel_uri = 's3://{}/{}/test/'.format(s3_bucket, s3_prefix)
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_channel_uri, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=validation_channel_uri, content_type='csv')

# setup metric
objective_metric_name = 'validation:auc'
print('Defined ML model estimator for {}'.format(experiment_name))

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Defined ML model estimator for sm-heart-xgb-exp-2021-11-05-17-21-07


In [158]:
#cell 08 ... HPO.Job.Run()

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Static hyperparameters shared by every training job of the tuning run.
# The target column is binary (0/1) and candidates are ranked by AUC, so the objective
# is set to binary logistic; otherwise XGBoost falls back to its default regression objective.
xgb.set_hyperparameters(objective = 'binary:logistic',
                        eval_metric = 'auc',
                        num_round = 100)

# Search space explored by the tuner.
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                            'min_child_weight': ContinuousParameter(1, 10),
                            'alpha': ContinuousParameter(0, 2),
                            'max_depth': IntegerParameter(1, 10)}

# 5 training jobs in total, at most 3 at a time; AUC is maximized (the tuner's default direction,
# spelled out here for the reader).
tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            objective_type='Maximize',
                            max_jobs=5,
                            max_parallel_jobs=3,
                            base_tuning_job_name = 'sm-heart-xgb-exp',
                            tags = [{'Key':'Experiment','Value':experiment_name},{'Key':'Trial','Value':trial_name}])

print('Defined ML model HPO job for {}'.format(experiment_name))

# job.Run() ... blocks until the tuning job has finished
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: sm-heart-xgb-exp-211105-1729


Defined ML model HPO job for sm-heart-xgb-exp-2021-11-05-17-21-07
...........................................................................................!


### Perform batch inference using the best candidate

Now that you have successfully completed the SageMaker HyperParameterTuner job on the dataset, create a model from any of the candidates by using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html). 

In [159]:
# cell09 ... associate HPO Job instances with Trials in parent Experiment

import time
from datetime import timezone
from smexperiments.search_expression import Filter, Operator, SearchExpression

# get the most recently created tuning job started by this notebook
# (NameContains keeps unrelated tuning jobs in the same account out of the result)
list_tuning_jobs_response = sm.list_hyper_parameter_tuning_jobs(
    NameContains='sm-heart-xgb-exp', SortBy="CreationTime", SortOrder="Descending"
)
tuning_jobs = list_tuning_jobs_response["HyperParameterTuningJobSummaries"]
print(f'Found {len(tuning_jobs)} tuning jobs.')
if not tuning_jobs:
    # fail with a clear message instead of an IndexError on tuning_jobs[0]
    raise RuntimeError("No 'sm-heart-xgb-exp' tuning job found; run the HPO cell first.")
most_recently_created_tuning_job = tuning_jobs[0]
tuning_job_name = most_recently_created_tuning_job["HyperParameterTuningJobName"]

# the search API takes the creation time as a UTC timestamp string
creation_time = most_recently_created_tuning_job["CreationTime"]
creation_time = creation_time.astimezone(timezone.utc)
creation_time = creation_time.strftime("%Y-%m-%dT%H:%M:%SZ")

created_after_filter = Filter(
    name="CreationTime",
    operator=Operator.GREATER_THAN_OR_EQUAL,
    value=str(creation_time),
)
# Trial component names embed the tuning job name, so filtering on the full job name
# (not just the shared prefix) leaves out components of other tuning jobs.
component_name_filter = Filter(
    name="TrialComponentName", operator=Operator.CONTAINS, value=tuning_job_name
)
source_type_filter = Filter(
    name="Source.SourceType", operator=Operator.EQUALS, value="SageMakerTrainingJob"
)

search_expression = SearchExpression(
    filters=[created_after_filter, component_name_filter, source_type_filter]
)

# search for related training trials
trial_component_search_results = list(
    TrialComponent.search(search_expression=search_expression, sagemaker_boto_client=sm)
)
print(f"Found {len(trial_component_search_results)} trial components.")

# load the trial, creating it only when it does not exist; other errors are re-raised
try:
    trial = Trial.load(trial_name)
except Exception as ex:
    if "ResourceNotFound" not in str(ex):
        raise
    trial = Trial.create(experiment_name=experiment_name, trial_name=trial_name)

for tc in trial_component_search_results:
    print(f"Associating trial component {tc.trial_component_name} with trial {trial.trial_name}.")
    trial.add_trial_component(tc.trial_component_name)
    # sleep to avoid throttling
    time.sleep(1)

Found 10 tuning jobs.
Found 5 trial components.
Associating trial component sm-heart-xgb-exp-211105-1729-004-4b4be2c6-aws-training-job with trial sm-heart-xgb-trial-2021-11-05-17-21-07.
Associating trial component sm-heart-xgb-exp-211105-1729-005-1641e28a-aws-training-job with trial sm-heart-xgb-trial-2021-11-05-17-21-07.
Associating trial component sm-heart-xgb-exp-211105-1729-003-73a579da-aws-training-job with trial sm-heart-xgb-trial-2021-11-05-17-21-07.
Associating trial component sm-heart-xgb-exp-211105-1729-002-e419cdc2-aws-training-job with trial sm-heart-xgb-trial-2021-11-05-17-21-07.
Associating trial component sm-heart-xgb-exp-211105-1729-001-e8019900-aws-training-job with trial sm-heart-xgb-trial-2021-11-05-17-21-07.


In [160]:
# cell 09 ... select best training job per the eval metric

# HPO.model.select ... keep and show the winner instead of discarding the return value
best_training_job_name = tuner.best_training_job()
print('Best training job: {}'.format(best_training_job_name))

# HPO.model.deploy ... hosts the best model on a real-time endpoint that accepts CSV payloads
# NOTE: the endpoint is billed until it is deleted (see the cleanup cell).
tuner_predictor = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=sagemaker.serializers.CSVSerializer())


2021-11-05 17:32:54 Starting - Preparing the instances for training
2021-11-05 17:32:54 Downloading - Downloading input data
2021-11-05 17:32:54 Training - Training image download completed. Training in progress.
2021-11-05 17:32:54 Uploading - Uploading generated training model
2021-11-05 17:32:54 Completed - Training job completed

INFO:sagemaker:Creating model with name: sm-heart-xgb-exp-2021-11-05-18-25-51-560





INFO:sagemaker:Creating endpoint with name sm-heart-xgb-exp-211105-1729-002-e419cdc2


-----!

In [None]:
# cell 10 ... evaluate trained and tuned model with new validation data
# visualize experiments


In [None]:
# cell zzz ... cleanup
# Removes the hosting resources created by `tuner.deploy(...)` so they stop incurring charges.

# delete model ... done first, while the endpoint configuration that references it still exists
tuner_predictor.delete_model()

# delete endpoint ... also removes its endpoint configuration by default
tuner_predictor.delete_endpoint()

# delete s3 bucket
# Deliberately left as a manual step: the data lives in the session's default bucket, which
# other work may share. Remove the objects under `s3_prefix` by hand if they are no longer needed.