In [None]:
# Install the necessary libraries

!pip install -U sagemaker scikit-learn pandas boto3

In [None]:
from sagemaker import get_execution_role, Session
import sys
import os

# Resolve the SageMaker session, then the execution role, region and default bucket.

sagemaker_session = Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()

# Put the parent directory on the import path (later cells import from `src.generated`).
parent_dir = os.path.join(sys.path[0], '../')
sys.path.insert(1, parent_dir)

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import pandas as pd

# Load the Iris dataset into a DataFrame: feature columns first, class label appended as 'target'.

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

In [None]:
import os

# Prepare Data: move 'target' to the first column, split 80/20, write header-less CSVs.

os.makedirs('./data', exist_ok=True)

feature_columns = [col for col in iris_df.columns if col != 'target']
iris_df = iris_df[['target', *feature_columns]]

train_data, test_data = train_test_split(iris_df, test_size=0.2, random_state=42)

for split_frame, csv_path in ((train_data, './data/train.csv'), (test_data, './data/test.csv')):
    split_frame.to_csv(csv_path, index=False, header=False)

In [None]:
import boto3

# Upload Data: push the local data directory to S3 and derive the input/output URIs.

s3_client = boto3.client("s3")

prefix = "DEMO-scikit-iris"
TRAIN_DATA = "train.csv"
TEST_DATA = "test.csv"
WORK_DIRECTORY = "data"

# Everything under WORK_DIRECTORY is uploaded beneath <prefix>/<WORK_DIRECTORY>.
upload_key_prefix = "{}/{}".format(prefix, WORK_DIRECTORY)
train_input = sagemaker_session.upload_data(
    WORK_DIRECTORY,
    bucket=bucket,
    key_prefix=upload_key_prefix,
)

s3_output_path = "s3://{}/{}/output".format(bucket, prefix)
s3_input_path = "s3://{}/{}/data/{}".format(bucket, prefix, TRAIN_DATA)

print(s3_input_path)
print(s3_output_path)

In [None]:
from sagemaker import image_uris

# Look up the XGBoost container image URI for the session's region.

image = image_uris.retrieve(
    framework='xgboost',
    region=region,
    version="latest",
)
print(image)

In [None]:
# Create TrainingJob with Boto3

import time
import boto3

client = boto3.client('sagemaker')
job_name_boto = 'xgboost-iris-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Assemble the larger request pieces first so the API call below reads as a summary.
xgb_hyper_parameters = {
    'objective': 'multi:softmax',
    'num_class': '3',
    'num_round': '10',
    'eval_metric': 'merror',
}
train_channel_config = {
    'ChannelName': 'train',
    'ContentType': 'csv',
    'CompressionType': 'None',
    'RecordWrapperType': 'None',
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': s3_input_path,
            'S3DataDistributionType': 'FullyReplicated',
        },
    },
}

response = client.create_training_job(
    TrainingJobName=job_name_boto,
    RoleArn=role,
    HyperParameters=xgb_hyper_parameters,
    AlgorithmSpecification={'TrainingImage': image, 'TrainingInputMode': 'File'},
    InputDataConfig=[train_channel_config],
    OutputDataConfig={'S3OutputPath': s3_output_path},
    ResourceConfig={'InstanceType': 'ml.m4.xlarge', 'InstanceCount': 1, 'VolumeSizeInGB': 30},
    StoppingCondition={'MaxRuntimeInSeconds': 600},
)
print(response)

In [None]:
# Wait for TrainingJob with Boto3
import time

TERMINAL_STATES = ('Failed', 'Completed', 'Stopped')

# Poll every 5 seconds, printing one dash per poll, until the job reports a terminal status.
response = client.describe_training_job(TrainingJobName=job_name_boto)
status = response['TrainingJobStatus']
while status not in TERMINAL_STATES:
    print("-", end="")
    time.sleep(5)
    response = client.describe_training_job(TrainingJobName=job_name_boto)
    status = response['TrainingJobStatus']

print(status)
if status == 'Failed':
    # Surface the service-provided reason alongside the failed status.
    print(response['FailureReason'])

In [None]:
# List TrainingJobs with Boto3
import datetime

# Use a timezone-aware UTC cutoff: botocore interprets naive datetimes as UTC, so a naive
# local `datetime.now()` would shift the 24-hour window by this machine's UTC offset.
creation_time_after = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)

# The paginator follows NextToken across pages, replacing the hand-written token loop.
paginator = client.get_paginator('list_training_jobs')
for response in paginator.paginate(CreationTimeAfter=creation_time_after):
    for job in response['TrainingJobSummaries']:
        print(job['TrainingJobName'])

In [None]:
# Create TrainingJob V3

import time
from src.generated.resources import TrainingJob, AlgorithmSpecification, Channel, DataSource, S3DataSource, \
    OutputDataConfig, ResourceConfig, StoppingCondition

# Timestamped job name (UTC, second resolution).
job_name_v3 = 'xgboost-iris-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Single 'train' channel reading the uploaded CSV from S3.
train_s3_source = S3DataSource(
    s3_data_type='S3Prefix',
    s3_uri=s3_input_path,
    s3_data_distribution_type='FullyReplicated',
)
train_channel = Channel(
    channel_name='train',
    content_type='csv',
    compression_type='None',
    record_wrapper_type='None',
    data_source=DataSource(s3_data_source=train_s3_source),
)

# Same request as the Boto3 cell, expressed with the generated resource/shape classes.
training_job = TrainingJob.create(
    training_job_name=job_name_v3,
    role_arn=role,
    hyper_parameters={
        'objective': 'multi:softmax',
        'num_class': '3',
        'num_round': '10',
        'eval_metric': 'merror',
    },
    algorithm_specification=AlgorithmSpecification(training_image=image, training_input_mode='File'),
    input_data_config=[train_channel],
    output_data_config=OutputDataConfig(s3_output_path=s3_output_path),
    resource_config=ResourceConfig(instance_type='ml.m4.xlarge', instance_count=1, volume_size_in_g_b=30),
    stopping_condition=StoppingCondition(max_runtime_in_seconds=600),
)

In [None]:
# Wait for TrainingJob V3
# Calls `wait()` on the resource returned by `TrainingJob.create(...)`.
# NOTE(review): presumably this blocks until the job reaches a terminal state, making it the
# V3 counterpart of the Boto3 polling loop — confirm against the generated resource class.

training_job.wait()

In [None]:
# List TrainingJobs V3
import datetime
from src.generated.resources import TrainingJob

# Timezone-aware UTC cutoff for "jobs created in the last 24 hours". A naive local
# `datetime.now()` carries no offset; botocore reads naive datetimes as UTC, which would
# shift the window by the local UTC offset (assuming `get_all` forwards the value to the
# ListTrainingJobs API — TODO confirm).
creation_time_after = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)

# Print the name and status of every job yielded by the generated resource iterator.
for job in TrainingJob.get_all(creation_time_after=creation_time_after):
    print(job.training_job_name, job.training_job_status)

In [None]:

# Creating TrainingJob using some inputs from Config File - Intelligent Defaults

import os
import time
from src.generated.resources import Cluster, TrainingJob
from src.generated.shapes import ClusterInstanceGroupSpecification, ClusterLifeCycleConfig, AlgorithmSpecification, \
    Channel, DataSource, S3DataSource, OutputDataConfig, ResourceConfig, StoppingCondition

# Setting path of Config file in environment variable.
# `setdefault` keeps a value that is already exported, so the notebook can be pointed at a
# config file on another machine without editing this cell. The fallback is a
# machine-specific absolute path — TODO: replace with a path relative to the repository.
os.environ.setdefault(
    'SAGEMAKER_ADMIN_CONFIG_OVERRIDE',
    '/Users/nargokul/workspace/sagemaker-code-gen/sample/sagemaker/2017-07-24/default-configs.json',
)

# Generating names for resources. The timestamp is taken once so both names are guaranteed
# to share the same suffix even if the clock ticks over between the two assignments.
run_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name_v3 = 'xgboost-iris-' + run_timestamp
cluster_name_v3 = 'xgboost-cluster-' + run_timestamp

# Create a Cluster. It has no default configs in default-configs.json, so values from the
# Global Defaults are used.
cluster_instance_group = ClusterInstanceGroupSpecification(
    instance_count=1,
    instance_group_name="instance-group-11",
    instance_type="ml.m5.4xlarge",
    life_cycle_config=ClusterLifeCycleConfig(source_s3_uri=s3_input_path, on_create="dothis"),
    execution_role=role,
)
cluster = Cluster.create(
    cluster_name=cluster_name_v3,
    instance_groups=[cluster_instance_group],
)

# Create a Training Job that uses the specific VPC Config present in the default configs JSON.
# No VPC settings are passed here; only the job-specific inputs are spelled out.
defaults_hyper_parameters = dict(
    objective='multi:softmax',
    num_class='3',
    num_round='10',
    eval_metric='merror',
)
defaults_train_data_source = DataSource(
    s3_data_source=S3DataSource(
        s3_data_type='S3Prefix',
        s3_uri=s3_input_path,
        s3_data_distribution_type='FullyReplicated',
    )
)

training_job = TrainingJob.create(
    training_job_name=job_name_v3,
    hyper_parameters=defaults_hyper_parameters,
    algorithm_specification=AlgorithmSpecification(
        training_image=image,
        training_input_mode='File',
    ),
    role_arn=role,
    input_data_config=[
        Channel(
            channel_name='train',
            content_type='csv',
            compression_type='None',
            record_wrapper_type='None',
            data_source=defaults_train_data_source,
        ),
    ],
    output_data_config=OutputDataConfig(s3_output_path=s3_output_path),
    resource_config=ResourceConfig(
        instance_type='ml.m4.xlarge',
        instance_count=1,
        volume_size_in_g_b=30,
    ),
    stopping_condition=StoppingCondition(max_runtime_in_seconds=600),
)

In [None]:
from src.generated.shapes import ContainerDefinition, ProductionVariant
# Endpoint Invoking 
from src.generated.resources import Model, EndpointConfig, Endpoint, TrainingJob

# Point the SDK at the default-configs file unless the variable is already set, so an
# exported override is not clobbered by this machine-specific absolute path.
# TODO: replace the fallback with a path relative to the repository.
os.environ.setdefault(
    'SAGEMAKER_ADMIN_CONFIG_OVERRIDE',
    '/Users/nargokul/workspace/sagemaker-code-gen/sample/sagemaker/2017-07-24/default-configs.json',
)

# The serve secret key must not be stored in the notebook; read it from the environment.
# A missing variable fails fast with a KeyError naming SAGEMAKER_SERVE_SECRET_KEY.
# NOTE(review): the value previously committed in this cell should be treated as leaked
# and rotated.
serve_secret_key = os.environ['SAGEMAKER_SERVE_SECRET_KEY']

model = Model.create(
    model_name='xgboost-iris-5-07',
    primary_container=ContainerDefinition(
        image='246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.7-1',
        # NOTE(review): this artifact URL is hard-coded to one bucket and one earlier job's
        # output; it is not derived from the training jobs created in this notebook.
        model_data_url='s3://sagemaker-us-west-2-211125564141/DEMO-scikit-iris/output/xgboost-iris-2024-05-17-02-10-05/output/serve.tar.gz',
        environment={
            'LOCAL_PYTHON': '3.10.12',
            'MODEL_CLASS_NAME': 'xgboost.sklearn.XGBClassifier',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '10',
            'SAGEMAKER_PROGRAM': 'inference.py',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_SERVE_SECRET_KEY': serve_secret_key,
            'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code'
        }
    ),
    execution_role_arn=role,
)

# If the model has already been created, Model.get() can be used to retrieve it instead:
# model = Model.get(model_name='xgboost-iris-5-07')

# Endpoint config with a single production variant that references the model by name.
iris_variant = ProductionVariant(
    variant_name='xgboost-iris-5-07',
    model_name='xgboost-iris-5-07',
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)
endpoint_config = EndpointConfig.create(
    endpoint_config_name='xgboost-iris-5-07',
    production_variants=[iris_variant],
)

# Endpoint backed by that config. Note: the name could also be chained from the
# `endpoint_config` object instead of being repeated as a literal.
endpoint: Endpoint = Endpoint.create(
    endpoint_name='xgboost-iris-5-07',
    endpoint_config_name='xgboost-iris-5-07',
)


Invoke the created Endpoint

In [None]:
from src.generated.resources import Endpoint
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from sagemaker.base_serializers import NumpySerializer

import io
import numpy as np

# Look up the endpoint created earlier by name.
endpoint = Endpoint.get(endpoint_name='xgboost-iris-5-07')

# Load the CSV and split it into X (columns 0-7) and Y (column 8).
dataset = loadtxt('data/pima-indians-diabetes.data.csv', delimiter=",")
X, Y = dataset[:, :8], dataset[:, 8]

# Hold out 33% of the rows using a fixed seed.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=seed, test_size=test_size
)
serializer = NumpySerializer()


def deserialise(response):
    """Decode an ``.npy`` payload from an invoke response.

    Reads all bytes from ``response['Body']`` and loads them with ``np.load``.

    Args:
        response: Mapping whose ``'Body'`` entry exposes a ``read()`` method returning bytes.

    Returns:
        The object produced by ``np.load`` for those bytes.
    """
    payload = response['Body'].read()
    buffer = io.BytesIO(payload)
    return np.load(buffer)


# Serialize the held-out rows and invoke the endpoint, sending and accepting 'application/x-npy'.
request_body = serializer.serialize(X_test)
invoke_result = endpoint.invoke(
    body=request_body,
    content_type='application/x-npy',
    accept='application/x-npy',
)

print("Endpoint Response:", deserialise(invoke_result))


In [None]:
from src.generated.resources import Endpoint
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from sagemaker.base_serializers import NumpySerializer

# Look up the endpoint by name and load the CSV used for the request payload.
endpoint = Endpoint.get(endpoint_name='xgboost-iris-5-07')
dataset = loadtxt('data/pima-indians-diabetes.data.csv', delimiter=",")

# Split into X (columns 0-7) and Y (column 8), then hold out 33% with a fixed seed.
X, Y = dataset[:, :8], dataset[:, 8]
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=seed, test_size=test_size
)
serializer = NumpySerializer()


def deserialise(response):
    """Collect every item yielded by iterating ``response['Body']`` into a list.

    Args:
        response: Mapping whose ``'Body'`` entry is iterable.

    Returns:
        A list of the items in iteration order (empty if the body yields nothing).
    """
    return list(response['Body'])


# Same request as the plain invoke, issued through the response-stream variant of the call.
stream_request_body = serializer.serialize(X_test)
invoke_result = endpoint.invoke_with_response_stream(
    body=stream_request_body,
    content_type='application/x-npy',
    accept='application/x-npy',
)

print("Endpoint Response:", deserialise(invoke_result))