In [None]:
# 1. Data Preparation
# 2. Move data into S3 bucket
# 3. Create Model 
# 4. Train the Model
# 5. Deploy the Model 

## Data Preparation

In [2]:
import urllib.request

urllib.request.urlretrieve("https://archive.ics.uci.edu/static/public/53/iris.zip", "iris.zip")


('iris.zip', <http.client.HTTPMessage at 0x7f18016fe3e0>)

In [3]:
!mkdir data
!unzip iris.zip -d data/

Archive:  iris.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [13]:
# 1. read data
# 2. convert to numeric values
# 3. shuffle
# 4. change label column index
# 5. split {train, val sets}

import pandas as pd

data = pd.read_csv('data/iris.data', header=None)

data[4] = data[4].replace('Iris-setosa', 0)
data[4] = data[4].replace('Iris-virginica', 1)
data[4] = data[4].replace('Iris-versicolor', 2)

data = data.sample(frac=1).reset_index(drop=True)

data = data[[4, 0, 1, 2, 3]]

train_data = data[:120]

validation_data = data[120:]

## Move data into S3 Bucket

In [15]:
import boto3

bucket_name = "sagemaker-iris-sagemaker"

train_data.to_csv('train_data.csv', header=False, index=False)
key = 'data/train/train_data'
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('train_data.csv')

validation_data.to_csv('validation_data.csv', header=False, index=False)
key = 'data/validation/validation_data'
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('validation_data.csv')


## Create Model

In [20]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

key = 'model/xgb_model'
s3_output_location = 's3://{}/{}'.format(bucket_name, key)

xgb_model = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, 'xgboost'),
    get_execution_role(),
    train_instance_count = 1,
    train_instance_type = 'ml.m4.xlarge',
    train_volume_size = 5,
    output_path = s3_output_location,
    sagemaker_session = sagemaker.Session()
)

xgb_model.set_hyperparameters(max_depth = 5,
                             eta = 0.2,
                             gamma = 4,
                             min_child_weight = 6,
                             silent = 0,
                             objective='multi:softmax',
                             num_class = 3,
                             num_round = 10)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Train Model

In [21]:
train_data = 's3://{}/{}'.format(bucket_name, 'data/train')
validation_data = 's3://{}/{}'.format(bucket_name, 'data/validation')

train_channel = sagemaker.session.s3_input(train_data, content_type = 'text/csv')
validation_channel = sagemaker.session.s3_input(validation_data, content_type = 'text/csv')

data_channels = {'train' : train_channel, 'validation' : validation_channel}

xgb_model.fit(inputs=data_channels)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-01-21-18-10-02-426


2024-01-21 18:10:02 Starting - Starting the training job......
2024-01-21 18:10:47 Starting - Preparing the instances for training.........
2024-01-21 18:12:05 Downloading - Downloading input data...
2024-01-21 18:12:35 Downloading - Downloading the training image...
2024-01-21 18:13:30 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2024-01-21:18:13:41:INFO] Running standalone xgboost training.[0m
[34m[2024-01-21:18:13:41:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8541.23mb[0m
[34m[2024-01-21:18:13:41:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:13:41] S3DistributionType set as FullyReplicated[0m
[34m[18:13:41] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-01-21:18:13:41:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:13:41] S3DistributionType set as FullyReplicated[0m
[34m[18:

## Deploy Model

In [22]:
xgb_predictor = xgb_model.deploy(initial_instance_count = 1,
                                instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-01-21-18-14-32-040
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-01-21-18-14-32-040
INFO:sagemaker:Creating endpoint with name xgboost-2024-01-21-18-14-32-040


-----!