In [1]:
# --- SageMaker setup ---
# Import the SageMaker SDK and supporting libraries

import boto3 # AWS SDK for Python. Used for direct S3 actions, clients, etc. (use it to upload CSVs) 
import os # filesystem / env var utilities.
import sagemaker # high-level SageMaker Python SDK. Provides Session, Estimator, helper classes and deployment tools
from sagemaker import get_execution_role # Convenience that returns the IAM role ARN when you run inside a SageMaker managed environment (Studio, notebook instance)
from sagemaker.serializers import CSVSerializer # Used by predictors to serialize inputs when calling a deployed endpoint (text/csv)
from sagemaker.inputs import TrainingInput # Helper that wraps an S3 URI and metadata (content type, input mode) for channels passed to .fit()
from sagemaker.sklearn import SKLearn # Higher-level shortcut to the Scikit-Learn estimator in the SDK

# --- Session, execution role and S3 destination ---

# Session object: holds config (default S3 bucket, region) and helpers for
# uploading data, describing jobs, etc.
sess = sagemaker.Session()

# IAM role ARN that SageMaker uses to access S3, CloudWatch, ECR, etc.
# The role must have the right permissions (S3 read/write, SageMaker actions).
role = get_execution_role()

# Update `bucket` with the name of a bucket that already exists in S3;
# `prefix` is the folder that will be created inside that bucket.
# NOTE(review): the "Session bucket" label below prints this user-chosen bucket,
# not the session's default bucket — the model artifact path printed later in
# this notebook lives in a different (sagemaker-<region>-<account>) bucket.
bucket = 'script-mode-xgb-demo'
prefix = 'demo-xgb-hyperopt'

print(
    f"Session bucket: {bucket}",
    f"Prefix: {prefix}",
    "✅ SageMaker session ready",
    sep="\n",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Session bucket: script-mode-xgb-demo
Prefix: demo-xgb-hyperopt
✅ SageMaker session ready


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled training data and work on a copy so that the frame read
# from disk (`df`) stays untouched.
df = pd.read_csv(filepath_or_buffer='data/training_data.csv')
df_copy = df.copy(deep=True)

# Preview the first rows (rendered as the cell output).
df_copy.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801
2,189.43935,553.88882,165.83379,202.465927,176.695586,321.155049,407.278389,161.245668,282.269025,221.570899,...,4.536947,581.823741,101.695639,0.653592,486.859084,117.491548,6.420465,20.713314,22.651537,12.944351
3,237.307878,195.894881,416.752252,468.729031,611.693517,301.411711,241.880655,49.597044,122.396821,13.828319,...,5.518968,45.014729,196.350455,47.638515,411.414213,67.142022,115.630943,8.927957,388.240433,14.79244
4,602.845256,16.103208,221.759979,345.765574,558.588369,276.704241,408.069566,19.390813,138.769765,146.662193,...,2.136214,133.59043,197.634584,26.278027,111.127557,172.181136,85.869642,30.537857,625.931837,11.802634


In [9]:
# --- Separate features and target ---
y_full = df_copy['target']
X_full = df_copy.drop(columns='target')

# --- Train / validation / test split (60 / 20 / 20) ---
# Step 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=12345
)
# Step 2: take 25% of the remaining 80% as the validation set
# (0.25 * 0.8 = 0.2 of the full dataset); the rest is the training set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=12345
)

# --- Export to CSV files ---
# Re-attach the target as the last column of each split.
train_df = pd.concat([X_train, y_train], axis=1)
valid_df = pd.concat([X_valid, y_valid], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# File name -> frame; the files are written with a header row and no index.
_split_files = {
    "train.csv": train_df,
    "validation.csv": valid_df,
    "test.csv": test_df,
}
for _filename, _frame in _split_files.items():
    _frame.to_csv(_filename, index=False)

print("✅ Local CSVs created successfully:")
for _filename, _frame in _split_files.items():
    print(f" - {_filename} ({_frame.shape})")

✅ Local CSVs created successfully:
 - train.csv ((480, 21))
 - validation.csv ((160, 21))
 - test.csv ((160, 21))


In [10]:
# --- Upload training, validation, and test data to the S3 bucket ---
# Resulting layout: s3://<bucket>/<prefix>/<channel>/<channel>.csv
# The boto3 bucket handle is created once and reused for all three uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel in ('train', 'validation', 'test'):
    # S3 keys always use '/' as separator, so build them explicitly instead of
    # with os.path.join (which follows the local OS path convention).
    s3_bucket.Object(f"{prefix}/{channel}/{channel}.csv").upload_file(f"{channel}.csv")

# Channel objects pointing at the uploaded data.
# content_type tells the training container how to parse the files it downloads.
# Every URI ends with '/' so that prefix matching selects only that channel's
# folder (a bare ".../train" prefix would also match sibling keys such as
# ".../train_old/...").
s3_input_train = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train/", content_type='csv'
)
s3_input_validation = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/validation/", content_type='csv'
)
s3_input_test = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/test/", content_type='csv'
)

In [15]:
# --- Create Estimator ---
from sagemaker.sklearn.estimator import SKLearn

# Script-mode estimator: runs train.py inside the prebuilt Scikit-Learn container.
sklearn_estimator = SKLearn(
    # Script executed inside the training container (Script Mode).
    entry_point='train.py',
    # Directory uploaded alongside the entry point; "." is the notebook's directory.
    source_dir=".",
    # IAM role SageMaker assumes to read S3 input, write S3 output, push logs, etc.
    role=role,
    # Compute instance type for the training job (drives cost and CPU/GPU availability).
    instance_type='ml.m5.xlarge',
    # Version of the prebuilt SKLearn container and its Python runtime.
    framework_version='1.2-1',
    py_version='py3',
    # Prefix SageMaker uses when generating the unique training job name.
    base_job_name='xgb-hyperopt-demo',
    # NOTE(review): the job log shows packages being installed from requirements.txt;
    # that install is presumably triggered by the file living in source_dir, while
    # `dependencies` is documented as extra paths shipped to the container — confirm
    # whether this argument is needed.
    dependencies=['requirements.txt'],
)

# --- Launch training job ---
# Channel name -> TrainingInput; one entry per data channel passed to the job.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
    'test': s3_input_test,
}

print("🚀 Launching training job...")
sklearn_estimator.fit(training_channels)  # .fit starts the SageMaker training job
print("✅ Training job launched successfully!")

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


🚀 Launching training job...


INFO:sagemaker:Creating training-job with name: xgb-hyperopt-demo-2025-10-18-23-08-54-298


2025-10-18 23:08:56 Starting - Starting the training job...
2025-10-18 23:09:11 Starting - Preparing the instances for training...
2025-10-18 23:09:53 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m2025-10-18 23:10:51,548 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-10-18 23:10:51,553 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-18 23:10:51,555 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-18 23:10:51,570 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-10-18 23:10:51,944 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt[0m
[34mCollecting xgboost==1.5.2 (from -r requirements.txt (line 1))
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)[0m
[34mCollecting hyperopt==0.2.7 (from 

In [16]:
# --- Where the model is (S3 SageMaker bucket) ---
import sagemaker

# Look up a finished training job by name and report its status and the S3
# location of the model artifact it produced.
sess = sagemaker.Session()

# Name of the last training job (copied from the training log output).
job_name = "xgb-hyperopt-demo-2025-10-18-23-08-54-298"  # last job name
desc = sess.describe_training_job(job_name)

job_status = desc["TrainingJobStatus"]
model_s3_path = desc["ModelArtifacts"]["S3ModelArtifacts"]
print("✅ Training job status:", job_status)
print("📦 Model S3 Path:", model_s3_path)

✅ Training job status: Completed
📦 Model S3 Path: s3://sagemaker-us-east-1-059535450142/xgb-hyperopt-demo-2025-10-18-23-08-54-298/output/model.tar.gz


In [17]:
# --- Which container image the model was trained in ---
# Queries the low-level SageMaker client for the job description and prints the
# training image URI recorded in its AlgorithmSpecification.
job_name = 'xgb-hyperopt-demo-2025-10-18-23-08-54-298'
sagemaker_session = sagemaker.session.Session()

training_job = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=job_name
)
training_image = training_job['AlgorithmSpecification']['TrainingImage']
print("📦 The Model Container is:", training_image)

📦 The Model Container is: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3


In [2]:
# --- Preparing new Dataset for Inference ---

import pandas as pd

# Read the blind (unlabelled) data into its own dataframe so that the training
# dataframe `df` loaded earlier in the notebook is not overwritten.
batch_data_path = 'data/blind_test_data.csv'
batch_df = pd.read_csv(batch_data_path, delimiter=',', index_col=None)

print(batch_df.shape)
print(batch_df.head())

# --- Save to CSV without headers and index ---
# Batch Transform is later fed this file line by line as text/csv, so it must
# contain only feature values (no header row, no index column).
local_batch_file = 'batch_data_for_transform.csv'
batch_df.to_csv(local_batch_file, header=False, index=False)

# --- Upload to S3 ---
s3 = boto3.Session().resource('s3')

# Reuse `bucket` from the setup cell and keep the batch location in its own
# variable: reassigning the notebook-wide `prefix` here would make a re-run of
# the training upload cell write to (and read from) the batch prefix.
batch_prefix = 'xgboost-input/batch'

# S3 keys always use '/', so the key is built explicitly rather than with
# os.path.join (which follows the local OS path convention).
s3_path = f"{batch_prefix}/{local_batch_file}"
s3.Bucket(bucket).Object(s3_path).upload_file(local_batch_file)

print(f"✅ Uploaded to s3://{bucket}/{s3_path}")

(200, 20)
    feature_0   feature_1   feature_2   feature_3   feature_4   feature_5  \
0  676.867615   32.518822  254.825875  502.268510  609.469688  497.624266   
1  628.695228  426.163933  347.070280  431.106903  915.527507  301.699534   
2  131.765943  323.839669  245.399775  181.814398  710.179159   59.117377   
3  160.970195  489.712029   70.482159  309.486269  888.030604  412.655666   
4  419.907137  216.625219  487.887860  253.704462  323.226862   65.744463   

    feature_6   feature_7   feature_8   feature_9  feature_10  feature_11  \
0  105.246239  269.045539  150.177005  312.649860  765.296227    0.237996   
1    1.666992  306.733041  104.234252   63.242070  467.009734    6.608084   
2  312.622788  687.965027  109.803179  381.169500  700.532108    1.822370   
3  216.124989   47.415477  104.139145  326.462385  378.446187    1.686895   
4  271.811469  527.726782  129.805782  168.429679  637.944633    0.948507   

   feature_12  feature_13  feature_14  feature_15  feature_16  f

In [None]:
# --- Batch Transform Phase ---

from sagemaker.model import Model
import sagemaker

# S3 location of the trained model artifact (model.tar.gz).
# NOTE(review): this path belongs to job ...2025-10-16-01-27-09-622, whereas the
# training job launched and described earlier in this notebook is
# ...2025-10-18-23-08-54-298 — confirm which artifact is intended.
model_artifact = 's3://sagemaker-us-east-1-059535450142/xgb-hyperopt-demo-2025-10-16-01-27-09-622/output/model.tar.gz'

# Image URI of the container the model was trained in
# (same value as the TrainingImage printed by the job-description cell).
container = "683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3"

# Where the batch input is read from and where predictions are written.
batch_input_uri = "s3://script-mode-xgb-demo/xgboost-input/batch/batch_data_for_transform.csv"
predictions_uri = "s3://script-mode-xgb-demo/xgboost-predictions"

# SageMaker Model object: pairs the artifact with the serving image.
model = Model(
    model_data=model_artifact,
    image_uri=container,
    sagemaker_session=sess,
    role=role,
)

# Transformer for Batch Transform: one instance, one record per request,
# outputs assembled line by line.
transformer = model.transformer(
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=predictions_uri,
    assemble_with="Line",
    strategy="SingleRecord",
)

# Run the Batch Transform job: the CSV input is split on line boundaries.
transformer.transform(
    data=batch_input_uri,
    split_type="Line",
    content_type="text/csv",
)

# Wait for job completion
transformer.wait()

In [2]:
from inference import run_batch_inference

# Arguments for the project's run_batch_inference helper: model artifact and
# serving image, the local blind-test CSV, and the S3 bucket/prefix to use.
# NOTE(review): the artifact path refers to job ...2025-10-16-01-27-09-622, not the
# ...2025-10-18-23-08-54-298 job trained earlier in this notebook — confirm.
inference_settings = dict(
    model_artifact='s3://sagemaker-us-east-1-059535450142/xgb-hyperopt-demo-2025-10-16-01-27-09-622/output/model.tar.gz',
    container='683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
    local_data_path="data/blind_test_data.csv",
    bucket="script-mode-xgb-demo",
    prefix="xgboost-input/batch",
)

batch_output = run_batch_inference(**inference_settings)
print(batch_output)

Loaded data shape: (200, 20)
Saved preprocessed file: batch_data_for_transform.csv
✅ Uploaded to s3://script-mode-xgb-demo/xgboost-input/batch/batch_data_for_transform.csv


INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2025-10-19-18-57-14-544


Starting Batch Transform job...


In [3]:
import boto3

client = boto3.client("service-quotas", region_name="us-east-1")

# Check SageMaker transform instance quotas.
# list_service_quotas returns results one page at a time; a single call only
# yields the first page, so transform-job quotas on later pages would be
# silently missed. The paginator follows NextToken until every page is read.
paginator = client.get_paginator("list_service_quotas")
quotas = [
    quota
    for page in paginator.paginate(ServiceCode="sagemaker")
    for quota in page["Quotas"]
]

# Print every quota whose name mentions transform jobs, with its current value.
for q in quotas:
    if "transform job" in q["QuotaName"].lower():
        print(f"{q['QuotaName']}: {q['Value']}")