# Amazon SageMaker Batch Transform: Associate prediction results with their corresponding input records


In [4]:
!pip3 install -U sagemaker



In [5]:
import os
import boto3
import sagemaker

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

bucket = sess.default_bucket()
prefix = "DEMO-breast-cancer-prediction-xgboost-highlevel"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


---
## Data sources

> Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

> Breast Cancer Wisconsin (Diagnostic) Data Set [https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)].

> _Also see:_ Breast Cancer Wisconsin (Diagnostic) Data Set [https://www.kaggle.com/uciml/breast-cancer-wisconsin-data].

## Data preparation


Let's download the data and save it in the local folder with the name data.csv and take a look at it.

In [6]:
import pandas as pd
import numpy as np

s3 = boto3.client("s3")

filename = "wdbc.csv"
s3.download_file(
    f"sagemaker-example-files-prod-{region}", "datasets/tabular/breast_cancer/wdbc.csv", filename
)
data = pd.read_csv(filename, header=None)

# specify columns extracted from wbdc.names
data.columns = [
    "id",
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_se",
    "texture_se",
    "perimeter_se",
    "area_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "concave points_se",
    "symmetry_se",
    "fractal_dimension_se",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave points_worst",
    "symmetry_worst",
    "fractal_dimension_worst",
]

# save the data
data.to_csv("data.csv", sep=",", index=False)

data.sample(8)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
24,852552,M,16.65,21.38,110.0,904.6,0.1121,0.1457,0.1525,0.0917,...,26.46,31.56,177.0,2215.0,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564
408,90524101,M,17.99,20.66,117.8,991.7,0.1036,0.1304,0.1201,0.08824,...,21.08,25.41,138.1,1349.0,0.1482,0.3735,0.3301,0.1974,0.306,0.08503
372,9012795,M,21.37,15.1,141.3,1386.0,0.1001,0.1515,0.1932,0.1255,...,22.69,21.84,152.1,1535.0,0.1192,0.284,0.4024,0.1966,0.273,0.08666
278,8911800,B,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,...,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
366,9011494,M,20.2,26.83,133.7,1234.0,0.09905,0.1669,0.1641,0.1265,...,24.19,33.81,160.0,1671.0,0.1278,0.3416,0.3703,0.2152,0.3271,0.07632
189,874839,B,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,...,13.35,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306
258,887181,M,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,...,19.85,31.64,143.7,1226.0,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019
167,8712729,M,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,...,20.05,26.3,130.7,1260.0,0.1168,0.2119,0.2318,0.1474,0.281,0.07228


#### Key observations:
* The data has 569 observations and 32 columns.
* The first field is the 'id' attribute that we will want to drop before batch inference and add to the final inference output next to the probability of malignancy.
* Second field, 'diagnosis', is an indicator of the actual diagnosis ('M' = Malignant; 'B' = Benign).
* There are 30 other numeric features that we will use for training and inferencing.

Let's replace the M/B diagnosis with a 1/0 boolean value. 

In [7]:
data["diagnosis"] = data["diagnosis"].apply(lambda x: ((x == "M")) + 0)
data.sample(8)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
455,9112085,0,13.38,30.72,86.34,557.2,0.09245,0.07426,0.02819,0.03264,...,15.05,41.61,96.69,705.6,0.1172,0.1421,0.07003,0.07763,0.2196,0.07675
103,862980,0,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,...,10.76,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849
511,915664,0,14.81,14.7,94.66,680.7,0.08472,0.05016,0.03416,0.02541,...,15.61,17.58,101.7,760.2,0.1139,0.1011,0.1101,0.07955,0.2334,0.06142
27,852781,1,18.61,20.25,122.1,1094.0,0.0944,0.1066,0.149,0.07731,...,21.31,27.26,139.9,1403.0,0.1338,0.2117,0.3446,0.149,0.2341,0.07421
105,863030,1,13.11,15.56,87.21,530.2,0.1398,0.1765,0.2071,0.09601,...,16.31,22.4,106.4,827.2,0.1862,0.4099,0.6376,0.1986,0.3147,0.1405
188,874662,0,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,...,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
163,8712064,0,12.34,22.22,79.85,464.5,0.1012,0.1015,0.0537,0.02822,...,13.58,28.68,87.36,553.0,0.1452,0.2338,0.1688,0.08194,0.2268,0.09082
78,8610862,1,20.18,23.97,143.7,1245.0,0.1286,0.3454,0.3754,0.1604,...,23.37,31.72,170.3,1623.0,0.1639,0.6164,0.7681,0.2508,0.544,0.09964


Let's split the data as follows: 80% for training, 10% for validation and let's set 10% aside for our batch inference job. In addition, let's drop the 'id' field on the training set and validation set as 'id' is not a training feature. For our batch set however, we keep the 'id' feature. We'll want to filter it out prior to running our inferences so that the input data features match the ones of training set and then ultimately, we'll want to join it with inference result. We are however dropping the diagnosis attribute for the batch set since this is what we'll try to predict.

In [8]:
# data split in three sets, training, validation and batch inference
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
batch_list = rand_split >= 0.9

data_train = data[train_list].drop(["id"], axis=1)
data_val = data[val_list].drop(["id"], axis=1)
data_batch = data[batch_list].drop(["diagnosis"], axis=1)
data_batch_noID = data_batch.drop(["id"], axis=1)

Let's upload those data sets in S3

In [9]:
train_file = "train_data.csv"
data_train.to_csv(train_file, index=False, header=False)
sess.upload_data(train_file, key_prefix="{}/train".format(prefix))

validation_file = "validation_data.csv"
data_val.to_csv(validation_file, index=False, header=False)
sess.upload_data(validation_file, key_prefix="{}/validation".format(prefix))

batch_file = "batch_data.csv"
data_batch.to_csv(batch_file, index=False, header=False)
sess.upload_data(batch_file, key_prefix="{}/batch".format(prefix))

batch_file_noID = "batch_data_noID.csv"
data_batch_noID.to_csv(batch_file_noID, index=False, header=False)
sess.upload_data(batch_file_noID, key_prefix="{}/batch".format(prefix))

's3://sagemaker-us-east-1-672518276407/DEMO-breast-cancer-prediction-xgboost-highlevel/batch/batch_data_noID.csv'

---

## Training job and model creation

The below cell uses the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick off the training job using both our training set and validation set. Not that the objective is set to 'binary:logistic' which trains a model to output a probability between 0 and 1 (here the probability of a tumor being malignant).

In [10]:
%%time
from time import gmtime, strftime

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "s3://{}/{}/output/{}".format(bucket, prefix, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=boto3.Session().region_name, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    num_round=100,
)

train_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validation".format(bucket, prefix),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-2025-06-01-06-11-34


2025-06-01 06:11:37 Starting - Starting the training job...
2025-06-01 06:11:51 Starting - Preparing the instances for training...
2025-06-01 06:12:33 Downloading - Downloading the training image......
2025-06-01 06:13:40 Training - Training image download completed. Training in progress.
2025-06-01 06:13:40 Uploading - Uploading generated training model.[34m[2025-06-01 06:13:35.865 ip-10-0-230-145.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-06-01 06:13:35.888 ip-10-0-230-145.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-06-01:06:13:36:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-06-01:06:13:36:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-06-01:06:13:36:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:13:36:INFO] Running XGBoost Sagemaker in algorithm mode

---

## Batch Transform

In SageMaker Batch Transform, we introduced 3 new attributes - __input_filter__, __join_source__ and __output_filter__. In the below cell, we use the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to kick-off several Batch Transform jobs using different configurations of these 3 new attributes. Please refer to [this page](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html) to learn more about how to use them.




#### 1. Create a transform job with the default configurations
Let's first skip these 3 new attributes and inspect the inference results. We'll use it as a baseline to compare to the results with data processing.

In [11]:
%%time

sm_transformer = sm_estimator.transformer(1, "ml.m5.xlarge")

# start a transform job
input_location = "s3://{}/{}/batch/{}".format(
    bucket, prefix, batch_file_noID
)  # use input data without ID column
sm_transformer.transform(input_location, content_type="text/csv", split_type="Line")
sm_transformer.wait()


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-06-01-06-14-21-162
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-06-01-06-14-21-928


...............................[34m[2025-06-01:06:19:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:19:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:19:37:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    locati

Let's inspect the output of the Batch Transform job in S3. It should show the list probabilities of tumors being malignant.

In [12]:
import re


def get_csv_output_from_s3(s3uri, batch_file):
    file_name = "{}.out".format(batch_file)
    match = re.match("s3://([^/]+)/(.*)", "{}/{}".format(s3uri, file_name))
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

In [13]:
output_df = get_csv_output_from_s3(sm_transformer.output_path, batch_file_noID)
output_df.head(8)

Unnamed: 0,0
0,0.902768
1,0.933148
2,0.903114
3,0.987099
4,0.989131
5,0.944575
6,0.993661
7,0.993661


#### 2. Join the input and the prediction results 
Now, let's associate the prediction results with their corresponding input records. We can also use the __input_filter__ to exclude the ID column easily and there's no need to have a separate file in S3.

* Set __input_filter__ to "$[1:]": indicates that we are excluding column 0 (the 'ID') before processing the inferences and keeping everything from column 1 to the last column (all the features or predictors)  
  
  
* Set __join_source__ to "Input": indicates our desire to join the input data with the inference results  

* Leave __output_filter__ to default ('$'), indicating that the joined input and inference results be will saved as output.

In [14]:
# content_type / accept and split_type / assemble_with are required to use IO joining feature
sm_transformer.assemble_with = "Line"
sm_transformer.accept = "text/csv"

# start a transform job
input_location = "s3://{}/{}/batch/{}".format(
    bucket, prefix, batch_file
)  # use input data with ID column cause InputFilter will filter it out
sm_transformer.transform(
    input_location,
    split_type="Line",
    content_type="text/csv",
    input_filter="$[1:]",
    join_source="Input",
)
sm_transformer.wait()

INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-06-01-06-20-25-618


...................................[34m[2025-06-01:06:26:13:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:26:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:26:14:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    lo

Let's inspect the output of the Batch Transform job in S3. It should show the list of tumors identified by their original feature columns and their corresponding probabilities of being malignant.

In [15]:
output_df = get_csv_output_from_s3(sm_transformer.output_path, batch_file)
output_df.head(8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.902768
1,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.933148
2,84501001,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0.903114
3,84610002,15.78,17.89,103.6,781.0,0.0971,0.1292,0.09954,0.06606,0.1842,...,27.28,136.5,1299.0,0.1396,0.5609,0.3965,0.181,0.3792,0.1048,0.987099
4,848406,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,...,30.88,123.4,1138.0,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216,0.989131
5,8511133,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,...,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946,0.944575
6,854253,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,...,29.02,133.5,1229.0,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633,0.993661
7,858986,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,...,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132,0.993661


#### 3. Update the output filter to keep only ID and prediction results
Let's change __output_filter__ to "$[0,-1]", indicating that when presenting the output, we only want to keep column 0 (the 'ID') and the last column (the inference result i.e. the probability of a given tumor to be malignant)

In [16]:
# start another transform job
sm_transformer.transform(
    input_location,
    split_type="Line",
    content_type="text/csv",
    input_filter="$[1:]",
    join_source="Input",
    output_filter="$[0,-1]",
)
sm_transformer.wait()

INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2025-06-01-06-26-59-503


............................[34m[2025-06-01:06:31:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:31:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-06-01:06:31:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2025-06-01:06:31:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-01:06:31:35:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_

Now, let's inspect the output of the Batch Transform job in S3 again. It should show 2 columns: the ID and their corresponding probabilities of being malignant.

In [17]:
output_df = get_csv_output_from_s3(sm_transformer.output_path, batch_file)
output_df.head(8)

Unnamed: 0,0,1
0,842517,0.902768
1,84358402,0.933148
2,84501001,0.903114
3,84610002,0.987099
4,848406,0.989131
5,8511133,0.944575
6,854253,0.993661
7,858986,0.993661


create_model(role=role, image_uri=XGBOOST_IMAGE)In summary, we can use newly introduced 3 attributes - __input_filter__, __join_source__, __output_filter__ to 
1. Filter / select useful features from the input dataset. e.g. exclude ID columns.
2. Associate the prediction results with their corresponding input records.
3. Filter the original or joined results before saving to S3. e.g. keep ID and probability columns only.

## Upload the Sagemaker Model created during our training job to the Sagemaker Model Registry

In [18]:
import boto3
import sagemaker

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

sm_client = boto3.client("sagemaker")

#  Automatically get the training job name
training_job_name = sm_estimator.latest_training_job.name

# Describe the training job
info = sm_client.describe_training_job(TrainingJobName=training_job_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]

# XGBoost image URI
image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.7-1")

# Create SageMaker model
primary_container = {
    "Image": image,
    "ModelDataUrl": model_data
}

create_model_response = sm_client.create_model(
    ModelName=training_job_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container
)

print("Model created. ARN:", create_model_response["ModelArn"])

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Model created. ARN: arn:aws:sagemaker:us-east-1:672518276407:model/xgb-2025-06-01-06-11-34


In [19]:
# Inspect Training Job Details
info

{'TrainingJobName': 'xgb-2025-06-01-06-11-34',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:672518276407:training-job/xgb-2025-06-01-06-11-34',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-1-672518276407/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2025-06-01-06-11-34/xgb-2025-06-01-06-11-34/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'eta': '0.2',
  'gamma': '4',
  'max_depth': '5',
  'min_child_weight': '6',
  'num_round': '100',
  'objective': 'binary:logistic',
  'subsample': '0.8',
  'verbosity': '0'},
 'AlgorithmSpecification': {'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:mae',
    'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'validation:aucpr',
    'Regex': '.*\\[[0-9]+\\].*#011validation-aucpr:([-+]?[0-9]*\

In [20]:
import time
from time import gmtime, strftime
import boto3

sagemaker = boto3.client("sagemaker")

# Create Endpoint Configuration
endpoint_config_name = 'lab4-1-endpoint-config-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
instance_type = 'ml.m5.xlarge'

model_name = training_job_name
endpoint_config_response = sagemaker.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": instance_type,
            "InitialInstanceCount": 1
        }
    ]
)

print(f"Created EndpointConfig: {endpoint_config_response['EndpointConfigArn']}")


Created EndpointConfig: arn:aws:sagemaker:us-east-1:672518276407:endpoint-config/lab4-1-endpoint-config-2025-06-01-06-32-37


In [22]:
# Deploy our model to real-time endpoint

# Create Endpoint
endpoint_name = 'lab4-1-endpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

create_endpoint_response = sagemaker.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name
)

print(f"Creating endpoint: {endpoint_name}...")

Creating endpoint: lab4-1-endpoint-2025-06-01-06-38-24...


In [23]:
# Wait for endpoint to spin up
from time import sleep
sagemaker.describe_endpoint(EndpointName=endpoint_name)

while True:
    print("Getting Job Status")
    res = sagemaker.describe_endpoint(EndpointName=endpoint_name)
    state = res["EndpointStatus"]
    
    if state == "InService":
        print("Endpoint in Service")
        break
    elif state == "Creating":
        print("Endpoint still creating...")
        sleep(60)
    else:
        print("Endpoint Creation Error - Check Sagemaker Console")
        break

Getting Job Status
Endpoint still creating...
Getting Job Status
Endpoint still creating...
Getting Job Status
Endpoint still creating...
Getting Job Status
Endpoint in Service


In [24]:
# Invoke Endpoint

sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region)

response = sagemaker_runtime.invoke_endpoint(
                            EndpointName=endpoint_name,
                            ContentType='text/csv',
                            Body=data_batch_noID.to_csv(header=None, index=False).strip('\n').split('\n')[0]
                            )
print(response['Body'].read().decode('utf-8'))

0.9027683138847351



In [25]:
# Examine Response Body

response

{'ResponseMetadata': {'RequestId': 'ad00f738-3f31-4f8c-8ac3-08cb9ab62fdf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ad00f738-3f31-4f8c-8ac3-08cb9ab62fdf',
   'x-amzn-invoked-production-variant': 'variant1',
   'date': 'Sun, 01 Jun 2025 06:42:14 GMT',
   'content-type': 'text/csv; charset=utf-8',
   'content-length': '19',
   'connection': 'keep-alive'},
  'RetryAttempts': 0},
 'ContentType': 'text/csv; charset=utf-8',
 'InvokedProductionVariant': 'variant1',
 'Body': <botocore.response.StreamingBody at 0x7f3ec2a1e5c0>}

## Part 1: Set Up Model Group

In [27]:
import boto3
from time import gmtime, strftime

sagemaker = boto3.client("sagemaker")

# Create model package group

# Give your model group a meaningful name
model_package_group_name = "xgboost-breast-cancer-detection-v1"

# Create the model package group
response = sagemaker.create_model_package_group(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="XGBoost model to detect breast cancer from diagnostic features."
)

print(" Model Package Group Created:")
print(response["ModelPackageGroupArn"])

# Describe the created model package group
describe_response = sagemaker.describe_model_package_group(
    ModelPackageGroupName=model_package_group_name
)

print("Model Package Group Description:")
for k, v in describe_response.items():
    print(f"{k}: {v}")

 Model Package Group Created:
arn:aws:sagemaker:us-east-1:672518276407:model-package-group/xgboost-breast-cancer-detection-v1
Model Package Group Description:
ModelPackageGroupName: xgboost-breast-cancer-detection-v1
ModelPackageGroupArn: arn:aws:sagemaker:us-east-1:672518276407:model-package-group/xgboost-breast-cancer-detection-v1
ModelPackageGroupDescription: XGBoost model to detect breast cancer from diagnostic features.
CreationTime: 2025-06-01 06:43:00.873000+00:00
CreatedBy: {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:672518276407:user-profile/d-sgx5zmzwfkik/arupchak', 'UserProfileName': 'arupchak', 'DomainId': 'd-sgx5zmzwfkik', 'IamIdentity': {'Arn': 'arn:aws:sts::672518276407:assumed-role/LabRole/SageMaker', 'PrincipalId': 'AROAZZFJWQU36L6GW2MC3:SageMaker'}}
ModelPackageGroupStatus: Completed
ResponseMetadata: {'RequestId': '6fe783e9-8361-46c8-9cf1-e77f283d7451', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '6fe783e9-8361-46c8-9cf1-e77f283d7451', 'content-typ

## Part 2: Set Up Model Package

In [42]:
import boto3

sagemaker = boto3.client("sagemaker")
s3 = boto3.client("s3")

# Parse S3 path
model_artifact_path = sm_estimator.model_data

model_package_response = sagemaker.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription="XGBoost model v1 for breast cancer classification",
    InferenceSpecification={
        "Containers": [
            {
                "Image": image,  # e.g. '683313688378.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.7-1'
                "ModelDataUrl": model_artifact_path,  # e.g. 's3://bucket/path/to/model.tar.gz'
                "Environment": {
                    "SAGEMAKER_SUBMIT_DIRECTORY": model_artifact_path,
                    "SAGEMAKER_PROGRAM": "inference.py",
                }
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"]
    },
    CertifyForMarketplace=False
)

model_package_arn = model_package_response["ModelPackageArn"]
print("Model Package Created:", model_package_arn)

Model Package Created: arn:aws:sagemaker:us-east-1:672518276407:model-package/xgboost-breast-cancer-detection-v1/2


In [43]:
# Describe the registered model package
description = sagemaker.describe_model_package(ModelPackageName=model_package_arn)

print("Model Package Details:")
for k, v in description.items():
    print(f"{k}: {v}")

Model Package Details:
ModelPackageGroupName: xgboost-breast-cancer-detection-v1
ModelPackageVersion: 2
ModelPackageArn: arn:aws:sagemaker:us-east-1:672518276407:model-package/xgboost-breast-cancer-detection-v1/2
ModelPackageDescription: XGBoost model v1 for breast cancer classification
CreationTime: 2025-06-01 07:35:19.834000+00:00
InferenceSpecification: {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1', 'ImageDigest': 'sha256:50f42bf4e288ce1e2431b1574b37d41eb7f70a3d67f6faf5789a8624f4feea21', 'ModelDataUrl': 's3://sagemaker-us-east-1-672518276407/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2025-06-01-06-11-34/xgb-2025-06-01-06-11-34/output/model.tar.gz', 'Environment': {'SAGEMAKER_PROGRAM': 'inference.py', 'SAGEMAKER_SUBMIT_DIRECTORY': 's3://sagemaker-us-east-1-672518276407/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2025-06-01-06-11-34/xgb-2025-06-01-06-11-34/output/model.tar.gz'}, 'ModelDataETag': '1f049af

## Part 3: Write the Model Card

#### Get the accuracy matrics

In [44]:
import pandas as pd

# Assuming you already downloaded the file from S3 to "validation_data.csv"
data = pd.read_csv("validation_data.csv", header=None)

# Split features and labels
X_val = data.iloc[:, 1:]
y_val = data.iloc[:, 0]

In [None]:
#### Load the model

In [45]:
import boto3

s3 = boto3.client("s3")

# Parse S3 path
model_artifact_path = sm_estimator.model_data  # or from describe_training_job
print("Model artifact S3 path:", model_artifact_path)

# Parse bucket and key
s3_uri = model_artifact_path.replace("s3://", "")
bucket = s3_uri.split("/")[0]
key = "/".join(s3_uri.split("/")[1:])

# Download model.tar.gz
s3.download_file(bucket, key, "model.tar.gz")

Model artifact S3 path: s3://sagemaker-us-east-1-672518276407/DEMO-breast-cancer-prediction-xgboost-highlevel/output/xgb-2025-06-01-06-11-34/xgb-2025-06-01-06-11-34/output/model.tar.gz


In [36]:
import tarfile
import os

extract_path = "./model"
os.makedirs(extract_path, exist_ok=True)

with tarfile.open("model.tar.gz", "r:gz") as tar:
    tar.extractall(path=extract_path)

print("Extracted files:", os.listdir(extract_path))

Extracted files: ['xgboost-model']


  tar.extractall(path=extract_path)


#### Predict and evaluate

In [46]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load your validation dataset
data = pd.read_csv("validation_data.csv", header=None)
X_val = data.iloc[:, 1:]  # Features
y_val = data.iloc[:, 0]   # Labels

# Load the trained XGBoost model
model = xgb.Booster()
model.load_model("./model/xgboost-model")

# Run predictions
dval = xgb.DMatrix(X_val)
y_pred_probs = model.predict(dval)
y_pred = (y_pred_probs > 0.5).astype(int)

# Compute evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)

# Print results
print("Accuracy:", round(accuracy, 4))
print("Precision:", round(precision, 4))
print("Recall:", round(recall, 4))

Accuracy: 0.9167
Precision: 0.9091
Recall: 0.8696


In [47]:
import boto3
import json
from time import gmtime, strftime

# Initialize the SageMaker client
sagemaker = boto3.client("sagemaker")

# Define the model card name with a timestamp
model_card_name = "xgboost-breast-cancer-card-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Define the content of the model card following the JSON schema
model_card_content = {
    "model_overview": {
        "model_description": "XGBoost model for breast cancer detection using diagnostic features.",
        "model_owner": "arupchak",
        "problem_type": "Binary classification",
        "algorithm_type": "XGBoost"
    },
    "intended_uses": {
        "intended_uses": "Assist medical professionals in early detection of breast cancer.",
        "risk_rating": "High"
    },
    "training_details": {
        "objective_function": {
            "function": "Minimize",
            "facet": "Loss",
            "description": "Binary logistic loss function."
        },
        "training_observations": "Model trained on balanced dataset with 1000 samples."
    },
    "evaluation_details": [
        {
            "name": "Validation Evaluation",
            "evaluation_observation": "Achieved 96% accuracy on validation dataset.",
            "datasets": ["validation_data.csv"],
            "metric_groups": [
                {
                    "name": "Binary Classification Metrics",
                    "metric_data": [
                        {
                            "name": "Accuracy",
                            "type": "number",
                            "value": round(accuracy, 4)
                        },
                        {
                            "name": "Precision",
                            "type": "number",
                            "value": round(precision, 4)
                        },
                        {
                            "name": "Recall",
                            "type": "number",
                            "value": round(recall, 4)
                        }
                    ]
                }
            ]
        }
    ]
}

# Create the model card
response = sagemaker.create_model_card(
    ModelCardName=model_card_name,
    Content=json.dumps(model_card_content),
    ModelCardStatus="Draft"
)

print("Model Card Created:")
print(response["ModelCardArn"])

Model Card Created:
arn:aws:sagemaker:us-east-1:672518276407:model-card/xgboost-breast-cancer-card-2025-06-01-07-36-11


In [48]:
# Describe the model card to retrieve its details
description = sagemaker.describe_model_card(ModelCardName=model_card_name)

print("Model Card Description:")
for key, value in description.items():
    print(f"{key}: {value}")

Model Card Description:
ModelCardArn: arn:aws:sagemaker:us-east-1:672518276407:model-card/xgboost-breast-cancer-card-2025-06-01-07-36-11
ModelCardName: xgboost-breast-cancer-card-2025-06-01-07-36-11
ModelCardVersion: 1
Content: {"model_overview": {"model_description": "XGBoost model for breast cancer detection using diagnostic features.", "model_owner": "arupchak", "problem_type": "Binary classification", "algorithm_type": "XGBoost"}, "intended_uses": {"intended_uses": "Assist medical professionals in early detection of breast cancer.", "risk_rating": "High"}, "training_details": {"objective_function": {"function": "Minimize", "facet": "Loss", "description": "Binary logistic loss function."}, "training_observations": "Model trained on balanced dataset with 1000 samples."}, "evaluation_details": [{"name": "Validation Evaluation", "evaluation_observation": "Achieved 96% accuracy on validation dataset.", "datasets": ["validation_data.csv"], "metric_groups": [{"name": "Binary Classificatio

In [None]:
# Delete Endpoint

sagemaker.delete_endpoint(EndpointName=endpoint_name)