# Analyze Heart Disease with Amazon SageMaker XGBoost
---

In [9]:
# cell 00 .. install and setup dependent libraries and SDKs for Jupyter Notebook
%pip install sagemaker-experiments

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Note: you may need to restart the kernel to use updated packages.


In [10]:
# cell 01
import sagemaker

import boto3
from sagemaker import get_execution_role

region = boto3.Session().region_name

# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
session = sagemaker.Session()
s3_bucket = session.default_bucket()
s3_data_prefix = 'sagemaker/heartdisease/data/'
s3_model_prefix = 'sagemaker/heartdisease/xgboost'
s3_output_path = 's3://{}/{}/output'.format(s3_bucket, s3_model_prefix)
algorithm = 'xgboost'
trial_prefix = 'sm-heart-xgb-trial'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)
s3 = boto3.Session().resource('s3')

print(s3_bucket)

sagemaker-us-east-2-645411899653


In [11]:
# cell 02 ... ETL
s3_remote_path = s3_data_prefix + 'heart_failure_clinical_records_data-02-processed.csv'
sm_local_path = 'heart_failure_clinical_records_data-02-processed.csv'

# download file from remote to local
s3.Bucket(s3_bucket).download_file( s3_remote_path, sm_local_path)

In [12]:
# cell 03 ... preview input data frame ... 299 rows x 12 columns
import pandas as pd

data = pd.read_csv(sm_local_path)
pd.set_option('display.max_columns', 50)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data

Unnamed: 0,target_heart_failure,sex,smoking,diabetes,anaemia,high_blood_pressure,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
0,1,1,0,0,0,1,1.192945,0.000166,-1.530560,1.681648e-02,0.490057,-1.504036
1,1,1,0,0,0,0,-0.491279,7.514640,-0.007077,7.535660e-09,-0.284552,-0.141976
2,1,1,1,0,0,0,0.350833,-0.449939,-1.530560,-1.038073e+00,-0.090900,-1.731046
3,1,1,0,0,1,0,-0.912335,-0.486071,-1.530560,-5.464741e-01,0.490057,0.085034
4,1,0,0,1,1,0,0.350833,-0.435486,-1.530560,6.517986e-01,1.264666,-4.682176
...,...,...,...,...,...,...,...,...,...,...,...,...
294,0,1,1,1,0,1,0.098199,-0.537688,-0.007077,-1.109765e+00,-0.284552,1.447094
295,0,0,0,0,0,0,-0.491279,1.278215,-0.007077,6.802472e-02,-0.187726,0.539054
296,0,0,0,1,0,0,-1.333392,1.525979,1.854958,4.902082e+00,-0.575031,0.312044
297,0,1,1,0,0,0,-1.333392,1.890398,-0.007077,-1.263389e+00,0.005926,0.766064


In [13]:
# cell 04 ... split data set into training and test subsets
train_data = data.sample(frac=0.8,random_state=200)

test_data = data.drop(train_data.index)

In [54]:
# cell 05 .. publish training and test subsets of data to S3
train_file = 'train_data.csv';
train_data.to_csv(train_file, index=False, header=False)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=s3_model_prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

test_file = 'test_data.csv';
test_data.to_csv(test_file, index=False, header=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=s3_model_prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-2-645411899653/sagemaker/heartdisease/xgboost/train/train_data.csv
Test data uploaded to: s3://sagemaker-us-east-2-645411899653/sagemaker/heartdisease/xgboost/test/test_data.csv


## Launching the SageMaker HyperParameter Tuning Job<a name="Launching"></a>

In [14]:
# cell 06 ... create parent experiment to associate with HPO tuning job

import time
from time import strftime

import smexperiments
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

create_date = strftime("%Y-%m-%d-%H-%M-%S")
experiment_prefix = 'sm-heart-exp' 
experiment_name = 'sm-heart-exp-{}'.format(create_date)
trial_name = '{}-{}'.format(trial_prefix, create_date)

# experiment
try:
    experiment = Experiment.load(experiment_name = experiment_name)
except Exception as ex:
    if "ResourceNotFound" in str(ex):
        experiment = Experiment.create(experiment_name = experiment_name, 
                                       description = "SageMaker Heart Disease experiment", 
                                       tags = [{'Key': 'Experiment', 'Value': experiment_name}])
# trial

try:
    trial = Trial.load(trial_name)
except Exception as ex:
    if "ResourceNotFound" in str(ex):
        trial = Trial.create(experiment_name=experiment_name, trial_name=trial_name)
        
print(experiment_name)
print(trial_name)

sm-heart-exp-2021-12-06-19-06-19
sm-heart-xgb-trial-2021-12-06-19-06-19


In [16]:
# cell 07
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig

# get algo container [class]
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework=algorithm, version='latest')
# setup estimator for algo [tbt running instance of algo class]
ml_model = sagemaker.estimator.Estimator(                
                container,
                role, 
                instance_count=1, 
                instance_type='ml.m4.xlarge',
                output_path=s3_output_path,
                sagemaker_session=session,
                tags = [{'Key':'Experiment','Value':experiment_name},{'Key':'Trial','Value':trial_name}],
                debugger_hook_config=DebuggerHookConfig(
                    s3_output_path = s3_output_path,
                    collection_configs = [
                        CollectionConfig(name="metrics",parameters={"save_interval":"5"}),
                        CollectionConfig(name="predictions",parameters={"save_interval":"5"}),
                        CollectionConfig(name="feature_importance",parameters={"save_interval":"5"}),
                        CollectionConfig(name="average_shap", parameters={"save_interval":"5"})
                    ]
                )                  
)

# setup training and test/validation channels
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(s3_bucket, s3_model_prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test/'.format(s3_bucket, s3_model_prefix), content_type='csv')
print('Defined ML model estimator for {}'.format(trial_name))

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Defined ML model estimator for sm-heart-xgb-trial-2021-12-06-19-06-19


In [17]:
#cell 08 ... HPO.Job.Run()

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

ml_model.set_hyperparameters( num_round = 100)

# define hyperparameter ranges
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost-tuning.html ... alpha, min_child_weight, eta, num_round
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                            'min_child_weight': ContinuousParameter(1, 10),
                            'alpha': ContinuousParameter(0, 100),
                            'max_depth': IntegerParameter(1, 10)}

# define metric ... F1 = harmonic mean of precision and recall
objective_metric_name = 'validation:auc'

# define HPO job
ml_tuner = HyperparameterTuner(ml_model,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=2,
                            max_parallel_jobs=2,
                            base_tuning_job_name = trial_prefix,
                           tags = [{'Key':'Experiment','Value':experiment_name},{'Key':'Trial','Value':trial_name}])

print('Defined ML model HPO job for {}'.format(trial_prefix))

# job.Run()
ml_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: sm-heart-xgb-trial-211206-1907


Defined ML model HPO job for sm-heart-xgb-trial
.............................................!


In [18]:
# cell09 ... associate HPO Job instances with Trials in parent Experiment

import time
from datetime import timezone
from smexperiments.search_expression import Filter, Operator, SearchExpression

# get the most recently created tuning job

list_tuning_jobs_response = sm.list_hyper_parameter_tuning_jobs(
    SortBy="CreationTime", SortOrder="Descending"
)
print(f'Found {len(list_tuning_jobs_response["HyperParameterTuningJobSummaries"])} tuning jobs.')
tuning_jobs = list_tuning_jobs_response["HyperParameterTuningJobSummaries"]
most_recently_created_tuning_job = tuning_jobs[0]

creation_time = most_recently_created_tuning_job["CreationTime"]
creation_time = creation_time.astimezone(timezone.utc)
creation_time = creation_time.strftime("%Y-%m-%dT%H:%M:%SZ")

created_after_filter = Filter(
    name="CreationTime",
    operator=Operator.GREATER_THAN_OR_EQUAL,
    value=str(creation_time),
)
source_arn_filter = Filter(
    name="TrialComponentName", operator=Operator.CONTAINS, value=trial_prefix
)
source_type_filter = Filter(
    name="Source.SourceType", operator=Operator.EQUALS, value="SageMakerTrainingJob"
)

search_expression = SearchExpression(
    filters=[created_after_filter, source_arn_filter, source_type_filter]
)

# search for related training trials
trial_component_search_results = list(
    TrialComponent.search(search_expression=search_expression, sagemaker_boto_client=sm)
)
print(f"Found {len(trial_component_search_results)} trial components.")

try:
    trial = Trial.load(trial_name)
except Exception as ex:
    if "ResourceNotFound" in str(ex):
        trial = Trial.create(experiment_name=experiment_name, trial_name=trial_name)

for tc in trial_component_search_results:
    print(f"Associating trial component {tc.trial_component_name} with trial {trial.trial_name}.")
    trial.add_trial_component(tc.trial_component_name)
    # sleep to avoid throttling
    time.sleep(1)

Found 10 tuning jobs.
Found 2 trial components.
Associating trial component sm-heart-xgb-trial-211206-1907-002-5d59fd5a-aws-training-job with trial sm-heart-xgb-trial-2021-12-06-19-06-19.
Associating trial component sm-heart-xgb-trial-211206-1907-001-9bb588ee-aws-training-job with trial sm-heart-xgb-trial-2021-12-06-19-06-19.


In [19]:
# cell 10 ... select best training job per the eval metric
# HPO.model.select
ml_tuner.best_training_job()
# HPO.model.deploy
ml_tuner_predictor = ml_tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=sagemaker.serializers.CSVSerializer())


2021-12-06 19:11:04 Starting - Preparing the instances for training
2021-12-06 19:11:04 Downloading - Downloading input data
2021-12-06 19:11:04 Training - Training image download completed. Training in progress.
2021-12-06 19:11:04 Uploading - Uploading generated training model
2021-12-06 19:11:04 Completed - Training job completed

INFO:sagemaker:Creating model with name: sm-heart-xgb-trial-2021-12-06-19-11-46-408





INFO:sagemaker:Creating endpoint with name sm-heart-xgb-trial-211206-1907-002-5d59fd5a


------!

In [20]:
# cell 11 ... evaluate best model performance
test_features = test_data.drop(['target_heart_failure'], axis=1)
test_labels = test_data['target_heart_failure']

In [29]:
# cell 12 .. calculate predictions for XGBoost
import numpy as np
import json

def predict(predictor, data, rows=500, verbose=False):
    predictions = ''
    for row in data:
        if (verbose):
            print(row)
        prediction = predictor.predict(row, initial_args={"ContentType":"text/csv"})
        if (verbose):
            print(prediction)
        predictions = ','.join([predictions, prediction.decode('utf-8')])

#    arrays = np.array_split(data[0], 100)
#    predictions = ''
#    for array in arrays:
#        prediction = predictor.predict(array, initial_args={"ContentType":"text/csv"})
#        predictions = ','.join([predictions, prediction.decode('utf-8')])
        
    return np.fromstring(predictions[1:],sep=',')


In [32]:
# cell 13 ... analyze confusion matrix on the test data set
result = ml_tuner_predictor.predict([1,0,0,0,0,-0.491279276,7.514639529,-0.00707675,7.54E-09,-0.284552352,-0.141976151], initial_args={"ContentType":"text/csv"})
print(result)

# print (test_features)
predictions = predict(ml_tuner_predictor, test_features.to_numpy(), verbose=False)

# print predictions
print(predictions)

# calculate confusion matrix
pd.crosstab(index=test_labels, columns=np.round(predictions), rownames=["actual"],colnames=["predictions"])

b'0.4420003294944763'
[0.44200033 0.5        0.5        0.44200033 0.44200033 0.44200033
 0.5        0.5        0.5        0.44200033 0.44200033 0.44200033
 0.5        0.44200033 0.44200033 0.44200033 0.5        0.44200033
 0.5        0.5        0.44200033 0.44200033 0.44200033 0.5
 0.5        0.5        0.5        0.44200033 0.44200033 0.44200033
 0.44200033 0.5        0.5        0.44200033 0.44200033 0.5
 0.44200033 0.5        0.44200033 0.44200033 0.44200033 0.44200033
 0.5        0.44200033 0.44200033 0.44200033 0.5        0.44200033
 0.44200033 0.44200033 0.44200033 0.44200033 0.5        0.44200033
 0.44200033 0.44200033 0.44200033 0.5        0.44200033 0.44200033]


predictions,0.0
actual,Unnamed: 1_level_1
0,43
1,17


In [34]:
# cell calculate classification metrics

from sklearn.metrics import precision_score, recall_score, f1_score
print('Precision = {}'.format(precision_score(test_labels, predictions, average='weighted', zero_division=1)))
print('Recall = {}'.format(recall_score(test_labels, predictions, average='weighted', zero_division=1)))
print('F1 = {}'.format(f1_score(test_labels, predictions, average='weighted', zero_division=1)))


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
# cell zzz ... cleanup
# delete endpoint
# delete model
# delete s3 bucket