In [1]:
# --- SageMaker setup ---
# Import libraries Sagemaker (Container)

import boto3 # AWS SDK for Python. Used for direct S3 actions, clients, etc. (use it to upload CSVs) 
import os # filesystem / env var utilities.
import sagemaker # high-level SageMaker Python SDK. Provides Session, Estimator, helper classes and deployment tools
from sagemaker import get_execution_role # Convenience that returns the IAM role ARN when you run inside a SageMaker managed environment (Studio, notebook instance)
from sagemaker.serializers import CSVSerializer # Used by predictors to serialize inputs when calling a deployed endpoint (text/csv)
from sagemaker.inputs import TrainingInput # Helper that wraps an S3 URI and metadata (content type, input mode) for channels passed to .fit()
from sagemaker.sklearn import SKLearn # Higher-level shortcut to the Scikit-Learn estimator in the SDK

# --- Project-specific S3 locations ---
bucket = 'script-mode-xgb-demo'  # Must name a bucket that already exists in S3
prefix = 'demo-xgb-hyperopt'  # Key prefix ("folder") under which this notebook stores its objects

# --- SageMaker handles shared by the rest of the notebook ---
# Session: wraps SDK configuration and helper calls (e.g. describing jobs).
sess = sagemaker.Session()
# Execution role: IAM role ARN passed to the estimator; it needs S3 read/write
# and SageMaker permissions.
role = get_execution_role()

# Single print call; sep="\n" yields the same three output lines.
print(
    f"Session bucket: {bucket}",
    f"Prefix: {prefix}",
    "‚úÖ SageMaker session ready",
    sep="\n",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Session bucket: script-mode-xgb-demo
Prefix: demo-xgb-hyperopt
‚úÖ SageMaker session ready


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw training table from the local data folder.
df = pd.read_csv('data/training_data.csv')

# Work on an independent deep copy so the frame as loaded stays untouched.
df_copy = df.copy(deep=True)

# Preview the first five rows.
df_copy.head(5)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801
2,189.43935,553.88882,165.83379,202.465927,176.695586,321.155049,407.278389,161.245668,282.269025,221.570899,...,4.536947,581.823741,101.695639,0.653592,486.859084,117.491548,6.420465,20.713314,22.651537,12.944351
3,237.307878,195.894881,416.752252,468.729031,611.693517,301.411711,241.880655,49.597044,122.396821,13.828319,...,5.518968,45.014729,196.350455,47.638515,411.414213,67.142022,115.630943,8.927957,388.240433,14.79244
4,602.845256,16.103208,221.759979,345.765574,558.588369,276.704241,408.069566,19.390813,138.769765,146.662193,...,2.136214,133.59043,197.634584,26.278027,111.127557,172.181136,85.869642,30.537857,625.931837,11.802634


In [3]:
# --- Feature Engineering ---
# Interaction terms: element-wise products of raw columns, written onto df_copy.
# The multiplication order (feature_2, then feature_9, then feature_13) is kept
# so floating-point results are unchanged.
df_copy['feature_2_9_13'] = (
    df_copy['feature_2']
    .mul(df_copy['feature_9'])
    .mul(df_copy['feature_13'])
)
df_copy['feature_9_x_13'] = df_copy['feature_9'].mul(df_copy['feature_13'])

# Columns kept for modelling; the label ('target') is deliberately last.
selected_features = ['feature_2_9_13', 'feature_9_x_13', 'feature_11', 'feature_18', 'feature_2', 'target']

df_xgb = df_copy[selected_features]
df_xgb.head()

Unnamed: 0,feature_2_9_13,feature_9_x_13,feature_11,feature_18,feature_2,target
0,791242.3,1643.915828,4.385848,13.8143,481.3156,14.364922
1,4348396.0,7421.483062,5.563334,2.480979,585.920055,19.984801
2,3736699.0,22532.794173,4.536947,20.713314,165.83379,12.944351
3,1131564.0,2715.196816,5.518968,8.927957,416.752252,14.79244
4,6427829.0,28985.521571,2.136214,30.537857,221.759979,11.802634


In [4]:
# --- Defining features and target ---
y_full = df_xgb['target']
X_full = df_xgb.drop(columns=['target'])

# Step 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=12345
)

# Step 2: carve the validation set out of the remaining 80%.
# 0.25 * 0.80 = 0.20 of the full data, so the final ratio is
# 60% train / 20% validation / 20% test.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=12345
)

# --- Export to CSV files ---
# Re-attach the label as the last column of each split.
train_df = X_train.assign(target=y_train)
valid_df = X_valid.assign(target=y_valid)
test_df = X_test.assign(target=y_test)

# Write each split without the pandas index column.
for csv_name, split_frame in (
    ("train.csv", train_df),
    ("validation.csv", valid_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(csv_name, index=False)

print("‚úÖ Local CSVs created successfully:")
print(f" - train.csv ({train_df.shape})")
print(f" - validation.csv ({valid_df.shape})")
print(f" - test.csv ({test_df.shape})")

‚úÖ Local CSVs created successfully:
 - train.csv ((480, 6))
 - validation.csv ((160, 6))
 - test.csv ((160, 6))


In [5]:
# --- Upload training, validation and test data to the S3 bucket ---
# Layout: s3://<bucket>/<prefix>/<channel>/<channel>.csv


def _s3_key(*parts):
    """Join S3 key segments with '/'.

    S3 keys always use forward slashes. os.path.join would insert the local
    OS separator instead (a backslash on Windows) and produce the wrong key.
    Empty segments and stray leading/trailing slashes are dropped so the
    result never contains '//'.
    """
    segments = [part.strip('/') for part in parts]
    return '/'.join(segment for segment in segments if segment)


def _channel_input(channel):
    """Return the TrainingInput for one channel's S3 folder.

    The URI ends with '/' so the prefix matches only that folder and not
    sibling keys that merely start with the same text.
    """
    return TrainingInput(
        s3_data=f's3://{bucket}/{_s3_key(prefix, channel)}/',
        content_type='csv',  # declares the data format of the channel
    )


# One session/resource is created once and reused for every upload.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for channel in ('train', 'validation', 'test'):
    local_csv = f'{channel}.csv'  # written to the working directory by the export cell
    s3_bucket.Object(_s3_key(prefix, channel, local_csv)).upload_file(local_csv)

# Channel definitions handed to estimator.fit(); each one points at the S3
# folder populated above.
s3_input_train = _channel_input('train')
s3_input_validation = _channel_input('validation')
s3_input_test = _channel_input('test')

In [6]:
# --- Create Estimator ---
# SKLearn is already imported in the setup cell (from sagemaker.sklearn import SKLearn).
sklearn_estimator = SKLearn(
    entry_point='train.py',  # Script SageMaker runs inside the training container (Script Mode)
    instance_type='ml.m5.xlarge',  # Compute instance type for the training job; drives cost and CPU/GPU availability
    framework_version='1.2-1',  # Version of the prebuilt scikit-learn container that runs train.py
    role=role,  # IAM role SageMaker assumes to read S3 input, write S3 output, push logs, etc.
    base_job_name='xgb-hyperopt-demo',  # Prefix of the generated, timestamped training job name
    py_version='py3',
    # Ships requirements.txt with the script; the training log shows the
    # toolkit pip-installing it before train.py is invoked.
    dependencies=['requirements.txt'],
)  # this closing parenthesis was missing, which made the whole cell a SyntaxError

# --- Launch training job ---
print("üöÄ Launching training job...")
# fit() creates the training job; each dict key becomes a named input channel
# backed by the corresponding S3 location.
# NOTE(review): the test split is also passed to the training job — confirm
# train.py uses it only for final evaluation and not for tuning.
sklearn_estimator.fit({
    'train': s3_input_train,
    'validation': s3_input_validation,
    'test': s3_input_test
})
print("‚úÖ Training job launched successfully!")

üöÄ Launching training job...


INFO:sagemaker:Creating training-job with name: xgb-hyperopt-demo-2025-10-16-01-27-09-622


2025-10-16 01:27:14 Starting - Starting the training job...
2025-10-16 01:27:28 Starting - Preparing the instances for training...
2025-10-16 01:28:11 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m2025-10-16 01:29:10,419 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-10-16 01:29:10,422 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-16 01:29:10,425 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-16 01:29:10,439 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-10-16 01:29:10,693 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt[0m
[34mCollecting xgboost==1.5.2 (from -r requirements.txt (line 1))
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)[0m
[34mCollecting hyperopt==0.2.7 (from 

In [7]:
# --- Where the model is (S3 SageMaker bucket) ---

# The import and session are repeated here on purpose: this cell can then be
# run on a fresh kernel to inspect a finished job without retraining.
import sagemaker

sess = sagemaker.Session()

# Prefer the job started by the estimator in this kernel, so a re-run of the
# notebook never reports a stale job. If no estimator exists or it has not
# been fitted, fall back to the job recorded when the notebook was last run.
FALLBACK_JOB_NAME = "xgb-hyperopt-demo-2025-10-16-01-27-09-622"
try:
    job_name = sklearn_estimator.latest_training_job.name
except (NameError, AttributeError):
    # NameError: estimator cell not run; AttributeError: no job launched yet.
    job_name = FALLBACK_JOB_NAME

desc = sess.describe_training_job(job_name)
print("‚úÖ Training job status:", desc["TrainingJobStatus"])
print("üì¶ Model S3 Path:", desc["ModelArtifacts"]["S3ModelArtifacts"])

‚úÖ Training job status: Completed
üì¶ Model S3 Path: s3://sagemaker-us-east-1-059535450142/xgb-hyperopt-demo-2025-10-16-01-27-09-622/output/model.tar.gz
