# Data Preparation

In [2]:
import urllib.request
urllib.request.urlretrieve('https://archive.ics.uci.edu/static/public/53/iris.zip', 'data.zip')

('data.zip', <http.client.HTTPMessage at 0x7f8f8fe12740>)

In [None]:
import urllib.request
urllib.request.urlretrieve('https://archive.ics.uci.edu/static/public/53/iris.zip', 'data.zip')

In [3]:
!mkdir data
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [24]:
#read data
import pandas as pd
data = pd.read_csv('data/iris.data', header=None)
# convert to numerical data
print(data)
data[4] = data[4].replace('Iris-setosa' , 0)
data[4] = data[4].replace('Iris-virginica' , 1)
data[4] = data[4].replace('Iris-versicolor' , 2)
#print(data)
#shuffle
data = data.sample(frac=1).reset_index(drop=True)

# change label column index

data = data[[4, 0, 1, 2, 3]]
print(data)

#  split train data
# 80% train data
# 20% validation data
train_data = data[:120]
val_data = data[:120]

       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
1    4.9  3.0  1.4  0.2     Iris-setosa
2    4.7  3.2  1.3  0.2     Iris-setosa
3    4.6  3.1  1.5  0.2     Iris-setosa
4    5.0  3.6  1.4  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[150 rows x 5 columns]
     4    0    1    2    3
0    2  6.2  2.2  4.5  1.5
1    2  6.1  2.9  4.7  1.4
2    2  6.1  2.8  4.7  1.2
3    0  5.0  3.2  1.2  0.2
4    2  5.6  3.0  4.1  1.3
..  ..  ...  ...  ...  ...
145  1  6.9  3.1  5.4  2.1
146  0  5.1  3.5  1.4  0.3
147  2  5.1  2.5  3.0  1.1
148  0  5.1  3.5  1.4  0.2
149  2  6.9  3.1  4.9  1.5

[150 rows x 5 columns]


  data[4] = data[4].replace('Iris-versicolor' , 2)


# Move data to s3

In [27]:
import boto3
bucket_name =  'sagemaker-us-east-1-676887225963'

train_data.to_csv('data.csv', header=False, index=False)
key = 'data/train/data'
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

val_data.to_csv('data.csv', header=False, index=False)
key = 'data/val/data'
url = 's3://{}/{}'.format(bucket_name, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file('data.csv')

# Create Model

In [39]:
import sagemaker
from sagemaker.amazon.amazon_estimator  import get_image_uri
from  sagemaker import get_execution_role
key = 'model/xgb_model'
s3_output_location = url = 's3://{}/{}'.format(bucket_name, key)

xgb_model = sagemaker.estimator.Estimator(
     get_image_uri(boto3.Session().region_name, 'xgboost'),
     get_execution_role(),
     train_instance_count =1,
     train_instance_type='ml.m4.xlarge', 
     train_volume_size=5,
     output_path=s3_output_location,
     sagemaker_session=sagemaker.Session()
)

xgb_model.set_hyperparameters(max_depth=5,
                          eta=0.2,
                          gama=4,
                          min_child_weight=6,
                          silent=0,
                          objective='multi:softmax',
                          num_class=3,
                          num_round=10)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# Train Model

In [40]:
train_data = 's3://{}/{}'.format(bucket_name, 'data/train')
val_data = 's3://{}/{}'.format(bucket_name, 'data/val')
train_channel = sagemaker.session.s3_input(train_data, content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_data, content_type='text/csv')
data_channels = { 'train': train_channel, 'validation': val_channel}
xgb_model.fit(inputs=data_channels)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-05-02-21-42-16-746


2024-05-02 21:42:16 Starting - Starting the training job...
2024-05-02 21:42:33 Starting - Preparing the instances for training...
2024-05-02 21:43:14 Downloading - Downloading input data...
2024-05-02 21:43:47 Downloading - Downloading the training image......
2024-05-02 21:44:43 Training - Training image download completed. Training in progress.
2024-05-02 21:44:43 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-05-02:21:44:36:INFO] Running standalone xgboost training.[0m
[34m[2024-05-02:21:44:36:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8481.77mb[0m
[34m[2024-05-02:21:44:36:INFO] Determined delimiter of CSV input is ','[0m
[34m[21:44:36] S3DistributionType set as FullyReplicated[0m
[34m[21:44:36] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-05-02:21:44:36:INFO] Determined delimiter of CSV input is ','[0m
[34m[21:44:

# Deploy Model

In [41]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                                instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-05-02-21-51-28-886
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-05-02-21-51-28-886
INFO:sagemaker:Creating endpoint with name xgboost-2024-05-02-21-51-28-886


-----!