In [131]:
import sagemaker

import numpy as np
import pandas as pd
import boto3
import re
print("libraries loaded")

libraries loaded


In [132]:
# upload data to s3
bucket_name = "deployml-sagemaker-demo"
train_data = r"bcancer/training/"
val_data = r"bcancer/validation/"

s3_model_output_location = r"s3://{0}/bcancer/model".format(bucket_name)
s3_training_file_location = r"s3://{0}/{1}".format(bucket_name, train_data)
s3_validation_file_location = r"s3://{0}/{1}".format(bucket_name, val_data)


In [133]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)

s3://deployml-sagemaker-demo/bcancer/model
s3://deployml-sagemaker-demo/bcancer/training/
s3://deployml-sagemaker-demo/bcancer/validation/


In [134]:
def write_to_s3(filename, bucket, key):
    with open(filename, "rb") as f:
        return boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(f)


In [135]:
write_to_s3("train.csv",bucket_name, train_data + "train.csv")
write_to_s3("test.csv",bucket_name, val_data + "test.csv")

In [136]:
sess = sagemaker.Session()
print("session created")

session created


In [137]:
role = sagemaker.get_execution_role()

In [138]:
role

'arn:aws:iam::349085393863:role/service-role/AmazonSageMaker-ExecutionRole-20210324T131939'

In [139]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name,
    "xgboost",
    "latest")

print("Sagemaker XGBoost Info :\n{} ({})".format(container, sess.boto_region_name))

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Sagemaker XGBoost Info :
433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest (us-west-2)


In [140]:
# building the model
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path = s3_model_output_location,
    sagemaker_session=sess,
    base_job_name="v1-xgboost-bcancer",
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [141]:
estimator.set_hyperparameters(max_depth=3,
                             objective="binary:logistic", num_round=500)
# num_round = n_estimators
# max_depth comes from something else. 

In [142]:
estimator.hyperparameters()

{'max_depth': 3, 'objective': 'binary:logistic', 'num_round': 500}

In [143]:
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type="csv",
    s3_data_type="S3Prefix",
)
validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type="csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": training_input_config, "validation":validation_input_config}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [144]:
print(training_input_config.config), print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://deployml-sagemaker-demo/bcancer/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://deployml-sagemaker-demo/bcancer/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


(None, None)

In [145]:
# Train the model
estimator.fit(data_channels)
# the model comes from a docker container located at
# 433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest (us-west-2)

2021-03-25 15:15:41 Starting - Starting the training job...
2021-03-25 15:15:43 Starting - Launching requested ML instancesProfilerReport-1616685340: InProgress
......
2021-03-25 15:16:57 Starting - Preparing the instances for training......
2021-03-25 15:18:07 Downloading - Downloading input data...
2021-03-25 15:18:36 Training - Downloading the training image...
2021-03-25 15:18:56 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2021-03-25:15:18:56:INFO] Running standalone xgboost training.[0m
[34m[2021-03-25:15:18:56:INFO] File size need to be processed in the node: 0.2mb. Available memory size in the node: 8413.55mb[0m
[34m[2021-03-25:15:18:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:18:56] S3DistributionType set as FullyReplicated[0m
[34m[15:18:56] 571x14 matrix with 7994 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-03-25:15:18:56:INFO] Determined delimi

In [None]:
# Deploy the model
# predictor = estimator.deploy(
#     initial_instance_count=1,
#     instance_type = "ml.m4.xlarge",
#     endpoint_name = "v1-xgboost-bcancer",
# )

### serialize model

In [146]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type = "ml.m4.xlarge",
    endpoint_name = "v1-xgboost-bcancer",
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

-------------!

-------------!

In [147]:
# realtime endpoint definition:
endpoint_name = "v1-xgboost-bcancer"
predictor = sagemaker.predictor.RealTimePredictor(endpoint_name)

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
df_test = pd.read_csv("test.csv", header=None)
df_test.head()

In [None]:
dftrain = pd.read_csv("train.csv", header=None)
dftrain.head()

In [150]:
arr_test = df_test[df_test.columns[1:]].values

In [179]:
import io

# def np2csv(arr):
#     csv = io.BytesIO()
#     np.savetxt(csv, arr, delimiter=',', fmt='%g')
#     return csv.getvalue().decode().rstrip()

def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip().replace("\n",",")


In [180]:
a = np2csv(arr_test[0])
b = np2csv(arr_test[0].transpose())
a == b
print(a)

0.29236,0.0827837,0.282703,0.26443,-0.0450265,0.0240417,0.0236516,0.144536,-0.0994288,-0.15405,0.06951,-0.0686936,0.0690962,0.0639425


In [181]:
payload = np2csv(arr_test[0])

In [182]:
payload

'0.29236,0.0827837,0.282703,0.26443,-0.0450265,0.0240417,0.0236516,0.144536,-0.0994288,-0.15405,0.06951,-0.0686936,0.0690962,0.0639425'

In [183]:
# result = predictor.predict(payload, initial_args={'ContentType': 'text/csv'})
##Unable to evaluate payload provided: Feature size of csv inference data 1 is not consistent with feature size of trained model 14".
# result = predictor.predict(a, initial_args={'ContentType': 'text/csv'})
##Unable to evaluate payload provided: Feature size of csv inference data 1 is not consistent with feature size of trained model 14".
# result = predictor.predict(open('test.csv','r'), initial_args={'ContentType': 'text/csv'})
##TypeError: Unicode-objects must be encoded before hashing
result = predictor.predict(payload, initial_args={'ContentType': 'text/csv'})

In [184]:
result

b'0.999983549118042'

In [190]:
payload = np2csv(arr_test[3])
result = predictor.predict(payload, initial_args={'ContentType': 'text/csv'})
result.decode()

'0.9999781847000122'

In [192]:
type(result.decode())

str