In [1]:
import os
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sagemaker.predictor import Predictor
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Set up SageMaker session and role
session = Session()
role = get_execution_role()
region = session.boto_region_name

In [3]:
# Get the built-in XGBoost container URI
container = sagemaker.image_uris.retrieve("xgboost", region, version="1.5-1")

# Create local directory for data
os.makedirs("data", exist_ok=True)

In [4]:

# Load and prepare a smaller subset of the Iris dataset (50 samples)
iris = load_iris()
X, y = iris.data, iris.target
subset_indices = np.random.choice(len(X), size=50, replace=False)  # Random 50 samples
X, y = X[subset_indices], y[subset_indices]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Prepare DataFrames in XGBoost format (target first)
train_data = pd.DataFrame(np.c_[y_train, X_train])  # Target column first
test_data = pd.DataFrame(np.c_[y_test, X_test])  # Target column first

train_path = "data/train.csv"
test_path = "data/test.csv"
train_data.to_csv(train_path, index=False, header=False)
test_data.to_csv(test_path, index=False, header=False)

In [6]:
# **Upload datasets to S3**
s3_bucket = session.default_bucket()  # Use default SageMaker S3 bucket
s3_prefix = "sagemaker-xgboost-iris-small"
train_s3_path = session.upload_data(train_path, bucket=s3_bucket, key_prefix=s3_prefix)
test_s3_path = session.upload_data(test_path, bucket=s3_bucket, key_prefix=s3_prefix)

In [9]:
# **Use SageMaker's Built-in XGBoost Model (No Script Needed)**
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",  # Enforced instance type
    output_path=f"s3://{s3_bucket}/{s3_prefix}/model",
    sagemaker_session=session,
    hyperparameters={
        "max_depth": 3,  # Smaller tree depth
        "eta": 0.1,  # Lower learning rate
        "gamma": 2,  # Less pruning
        "min_child_weight": 3,
        "subsample": 0.8,
        "objective": "multi:softprob",
        "num_class": 3,
        "num_round": 8,  # Fewer training iterations
    },
)

In [10]:
# **Train the model using TrainingInput**
train_input = TrainingInput(train_s3_path, content_type="csv")
test_input = TrainingInput(test_s3_path, content_type="csv")
xgb.fit({"train": train_input, "validation": test_input})


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-02-23-22-12-13-299


2025-02-23 22:12:14 Starting - Starting the training job...
2025-02-23 22:12:28 Starting - Preparing the instances for training...
2025-02-23 22:12:50 Downloading - Downloading input data...
2025-02-23 22:13:36 Downloading - Downloading the training image......
2025-02-23 22:14:42 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-02-23 22:14:37.234 ip-10-2-192-205.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-02-23 22:14:37.266 ip-10-2-192-205.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-02-23:22:14:37:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-02-23:22:14:37:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34m[2025-02-23:22:14:37:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-02-23:22:14:37:INFO] Runn

In [11]:
# **Deploy the trained model **
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",  # Make sure the instance type is available at your region.
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)




INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-02-23-22-15-37-212
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-02-23-22-15-37-212
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-02-23-22-15-37-212


------!

In [12]:


# Assuming the model is already deployed
predictor = Predictor(endpoint_name=xgb_predictor.endpoint_name)

# Convert the test data to CSV (no headers, no index)
X_test_df = pd.DataFrame(X_test)  # Assuming X_test is your test data
payload = X_test_df.to_csv(header=False, index=False).encode("utf-8")

# Manually set the content type when calling the endpoint
response = predictor.sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType='text/csv',  # specify the content type
    Accept='application/json',  # expected response type
    Body=payload
)

# Read the response
response_body = response['Body'].read().decode("utf-8")

# Print the raw response to debug
print(f"Raw Response: {response_body}")

# Assuming response is a JSON array or a dictionary with a 'predictions' key
response_dict = json.loads(response_body)
predictions = response_dict['predictions']

# Extracting the 'score' values and applying np.argmax to each
predicted_classes = [np.argmax(item['score']) for item in predictions]

print(f"Predicted Classes: {predicted_classes}")





Raw Response: {"predictions": [{"score": [0.6133274435997009, 0.19765879213809967, 0.1890137642621994]}, {"score": [0.1820436269044876, 0.2005375325679779, 0.6174188256263733]}, {"score": [0.6133274435997009, 0.19765879213809967, 0.1890137642621994]}, {"score": [0.6133274435997009, 0.19765879213809967, 0.1890137642621994]}, {"score": [0.1899353265762329, 0.5734841227531433, 0.23658053576946259]}, {"score": [0.6133274435997009, 0.19765879213809967, 0.1890137642621994]}, {"score": [0.20267529785633087, 0.22326521575450897, 0.5740594863891602]}, {"score": [0.21333828568458557, 0.564743161201477, 0.22191855311393738]}, {"score": [0.1976446956396103, 0.5967615246772766, 0.20559380948543549]}, {"score": [0.1820436269044876, 0.2005375325679779, 0.6174188256263733]}]}
Predicted Classes: [0, 2, 0, 0, 1, 0, 2, 1, 1, 2]


In [13]:
print(f"Endpoint Name: {xgb_predictor.endpoint_name}")

Endpoint Name: sagemaker-xgboost-2025-02-23-22-15-37-212


In [14]:


# Assuming you have your true labels (y_test) and predicted classes (predicted_classes) from the previous cell

# Example of true labels (y_test) – replace with your actual values
y_test = [0, 1, 2, 2, 1, 1, 0, 0, 1, 1]

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_classes)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision, recall, F1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predicted_classes, average='weighted')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, predicted_classes)
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.3000
Precision: 0.4083
Recall: 0.3000
F1 Score: 0.3357
Confusion Matrix:
[[1 1 1]
 [1 2 2]
 [2 0 0]]


In [None]:
# **Clean up resources**

In [15]:
# **Clean up resources**
xgb_predictor.delete_model()
xgb_predictor.delete_endpoint()

INFO:sagemaker:Deleting model with name: sagemaker-xgboost-2025-02-23-22-15-37-212
INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2025-02-23-22-15-37-212
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2025-02-23-22-15-37-212
