In [1]:
import os
import boto3
import re
import json
import sagemaker
import numpy as np
from sagemaker import get_execution_role
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
import xgboost as xgb
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, fmin, tpe, Trials, hp
from hyperopt.early_stop import no_progress_loss


In [2]:
test_split = 0.2
random_state = 42
number_of_trees = 1000
model_file_name = "local-xgboost-model"
best_param = {
    'alpha': 0,
    'n_estimators': 1000,
    'booster': 'gbtree',
    'silent': 1,
    'nthread': -1,
    "colsample_bytree": 0.9,
    "gamma": 0.7000000000000001,
    "learning_rate": 0.6000000000000001,
    "max_depth": 4,
    "reg_lambda": 2,
    "subsample": 0.5,
    'objective': 'reg:pseudohubererror',
    'eval_metric': 'mphe'}

In [5]:
region = boto3.Session().region_name
print(region)
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='SageMaker-MLops')['Role']['Arn']
print(role)
bucket = sagemaker.Session().default_bucket()
print(bucket)

eu-central-1


Couldn't call 'get_role' to get Role ARN from role name ashrith-iam to get Role path.


arn:aws:iam::444133344330:role/service-role/SageMaker-MLops
sagemaker-eu-central-1-444133344330


In [6]:
remove_first_col = (
    lambda df: df[[df.columns[i] for i in range(len(df.columns)) if i != 0]]
    .astype(str)
    .astype(int)
)
loss = lambda y, y_pred: np.sum(abs(np.subtract(np.array(y), np.array(y_pred))))

In [7]:
prefix = "sagemaker/DEMO-xgboost-byo"
bucket_path = "https://s3-{}.amazonaws.com/{}".format(region, bucket)

In [8]:
%%time
# Get the data from a public S3
buf_x = (
    boto3.client("s3")
    .get_object(
        Bucket=bucket, Key="labels_x.zip"
    )["Body"].read()
)
csv_file_key = 'labels_x/image_vector.csv'
with ZipFile(BytesIO(buf_x)) as zipf:
    csv_file = zipf.open(csv_file_key)
    x = pd.read_csv(csv_file, sep=',', header=None)
print(x.head())
x.shape

     0    1    2    3    4    5    6    7    8    9   ...   70   71   72   73  \
0  13.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  14.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
3  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    74   75   76   77   78   79  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 80 columns]
CPU times: user 45.6 ms, sys: 4.72 ms, total: 50.3 ms
Wall time: 225 ms


(1000, 80)

In [9]:
# Get the y data
buf_y = (
    boto3.client("s3")
    .get_object(
        Bucket=bucket, Key="labels_y.zip"
    )["Body"].read()
)
with ZipFile(BytesIO(buf_y)) as zipf:
    files = zipf.namelist()
    # delete parent folder in list
    del files[0]
    print(files)
    y_full = pd.DataFrame(
        np.column_stack(
            [
                files,
                pd.concat(
                    (
                        pd.read_csv(zipf.open(f), sep=",", header=None)
                        for f in files
                    )
                ).values.tolist(),
            ]
        )
    )
print(y_full.shape)
print(y_full.head())
y = remove_first_col(y_full)
print(y.head())
print(y.shape)

['labels_y/frame_Giving_money_vid_900.csv', 'labels_y/frame_Giving_money_vid_34110.csv', 'labels_y/frame_Giving_money_vid_19260.csv', 'labels_y/frame_Giving_money_vid_38070.csv', 'labels_y/frame_Giving_money_vid_9360.csv', 'labels_y/frame_Giving_money_vid_13410.csv', 'labels_y/frame_Giving_money_vid_15030.csv', 'labels_y/frame_Giving_money_vid_14610.csv', 'labels_y/frame_Giving_money_vid_5430.csv', 'labels_y/frame_Giving_money_vid_17460.csv', 'labels_y/frame_Giving_money_vid_26880.csv', 'labels_y/frame_Giving_money_vid_16710.csv', 'labels_y/frame_Giving_money_vid_38610.csv', 'labels_y/frame_Giving_money_vid_14400.csv', 'labels_y/frame_Giving_money_vid_4500.csv', 'labels_y/frame_Giving_money_vid_28140.csv', 'labels_y/frame_Giving_money_vid_11880.csv', 'labels_y/frame_Giving_money_vid_16440.csv', 'labels_y/frame_Giving_money_vid_18840.csv', 'labels_y/frame_Giving_money_vid_11520.csv', 'labels_y/frame_Giving_money_vid_27480.csv', 'labels_y/frame_Giving_money_vid_38220.csv', 'labels_y/fram

In [10]:
# split test and train
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_split,
    random_state=random_state,
    shuffle=True,
    stratify=y,
)

In [11]:
y_test = y_test
d_train = xgb.DMatrix(x_train, y_train)
d_test = xgb.DMatrix(x_test, y_test)
d_test_feature = xgb.DMatrix(x_test)

In [12]:
num_round = number_of_trees
watchlist = [
    (d_train, "train"),
    (d_test, "tests"),
]
print("loading data end, start to boost trees")
boosted_tree = xgb.train(
        best_param,
        d_train,
        num_round,
        #obj=pseudo_huber_loss,
        #custom_metric=pseudo_huber_loss_eval,
        evals=watchlist,
        verbose_eval=100,
    )
model = boosted_tree

loading data end, start to boost trees
Parameters: { "n_estimators", "silent" } are not used.

[0]	train-mphe:0.10742	tests-mphe:0.11169
[100]	train-mphe:0.07440	tests-mphe:0.13583
[200]	train-mphe:0.06852	tests-mphe:0.14530
[300]	train-mphe:0.06511	tests-mphe:0.14836
[400]	train-mphe:0.06295	tests-mphe:0.15110
[500]	train-mphe:0.06175	tests-mphe:0.15214
[600]	train-mphe:0.06093	tests-mphe:0.15173
[700]	train-mphe:0.05998	tests-mphe:0.15215
[800]	train-mphe:0.05887	tests-mphe:0.15229
[900]	train-mphe:0.05737	tests-mphe:0.15495
[999]	train-mphe:0.05628	tests-mphe:0.15397


In [13]:
y_pred = (model.predict(d_test_feature, strict_shape=True) > 0.5).astype(int)
print("Hamming loss is:", loss(y_test, y_pred))
print(y_test,y_pred)

Hamming loss is: 257
     1  2  3
390  1  1  1
188  0  1  1
618  0  0  1
753  1  0  1
490  1  1  1
..  .. .. ..
184  1  1  1
498  1  0  1
123  0  1  1
3    0  0  1
191  0  0  1

[200 rows x 3 columns] [[1 1 1]
 [1 0 0]
 [0 0 1]
 [1 0 1]
 [0 1 1]
 [1 1 0]
 [1 1 1]
 [1 0 1]
 [1 1 1]
 [0 1 1]
 [1 1 1]
 [1 1 1]
 [1 0 1]
 [0 0 1]
 [1 0 1]
 [0 0 1]
 [0 1 1]
 [1 1 1]
 [0 1 1]
 [1 1 1]
 [0 0 1]
 [0 0 1]
 [0 1 1]
 [1 0 1]
 [1 0 1]
 [1 0 1]
 [0 1 1]
 [1 0 0]
 [1 1 0]
 [0 0 1]
 [1 1 0]
 [0 1 1]
 [1 1 0]
 [1 0 1]
 [0 1 1]
 [1 1 0]
 [0 0 1]
 [0 1 1]
 [1 1 1]
 [0 0 1]
 [0 0 1]
 [1 0 1]
 [0 1 0]
 [0 1 1]
 [1 1 1]
 [0 0 1]
 [1 1 0]
 [1 0 1]
 [1 1 1]
 [0 0 1]
 [0 0 1]
 [1 1 1]
 [0 1 0]
 [0 1 1]
 [0 0 0]
 [0 0 1]
 [0 0 1]
 [1 1 1]
 [0 0 1]
 [0 0 1]
 [0 1 1]
 [1 1 0]
 [1 1 1]
 [1 0 1]
 [0 1 1]
 [1 0 0]
 [1 1 1]
 [0 0 0]
 [0 1 1]
 [0 1 1]
 [1 1 1]
 [1 1 1]
 [1 1 1]
 [0 0 1]
 [1 1 1]
 [1 1 1]
 [1 0 0]
 [0 1 0]
 [1 0 1]
 [1 0 1]
 [1 1 1]
 [1 1 1]
 [0 0 1]
 [0 1 1]
 [1 0 1]
 [0 0 1]
 [1 1 1]
 [1 1 0]
 [0 1 1

In [14]:
model.save_model(model_file_name)

In [15]:
!tar czvf model.tar.gz $model_file_name

local-xgboost-model


In [16]:
prefix = "sagemaker/DEMO-xgboost"
fObj = open("model.tar.gz", "rb")
key = os.path.join(prefix, model_file_name, "model.tar.gz")
boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(fObj)

In [17]:
from sagemaker import image_uris

container = image_uris.retrieve("xgboost",boto3.Session().region_name, "0.90-2")

In [19]:
%%time
from time import gmtime, strftime

model_name = model_file_name + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_url = "https://s3-{}.amazonaws.com/{}/{}".format(region, bucket, key)
sm_client = boto3.client("sagemaker")

print(model_url)

primary_container = {
    "Image": container,
    "ModelDataUrl": model_url,
}
print(role)
create_model_response2 = sm_client.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

#print(create_model_response2["ModelArn"])


https://s3-eu-central-1.amazonaws.com/sagemaker-eu-central-1-444133344330/sagemaker/DEMO-xgboost/local-xgboost-model/model.tar.gz
arn:aws:iam::444133344330:role/service-role/SageMaker-MLops
CPU times: user 33 ms, sys: 0 ns, total: 33 ms
Wall time: 694 ms


In [20]:
from time import gmtime, strftime

endpoint_config_name = "DEMO-XGBoostEndpointConfig-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "InitialVariantWeight": 1,
            "ModelName": model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-XGBoostEndpointConfig-2023-06-08-21-11-36
Endpoint Config Arn: arn:aws:sagemaker:eu-central-1:444133344330:endpoint-config/demo-xgboostendpointconfig-2023-06-08-21-11-36


In [21]:
%%time
import time

endpoint_name = "DEMO-XGBoostEndpoint-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response["EndpointArn"])

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

DEMO-XGBoostEndpoint-2023-06-08-21-16-59
arn:aws:sagemaker:eu-central-1:444133344330:endpoint/demo-xgboostendpoint-2023-06-08-21-16-59
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:eu-central-1:444133344330:endpoint/demo-xgboostendpoint-2023-06-08-21-16-59
Status: InService
CPU times: user 86.5 ms, sys: 12 ms, total: 98.6 ms
Wall time: 3min 1s


In [22]:
runtime_client = boto3.client("runtime.sagemaker")

In [39]:
import json

file_name = (
    "test.csv"  # customize to your test file, will be 'mnist.single.test' if use data above
)

with open(file_name, "r") as f:
    payload = f.read().strip()


response = runtime_client.invoke_endpoint(
    EndpointName='DEMO-XGBoostEndpoint-2023-06-08-21-16-59', ContentType="text/csv", Body=payload
)
result = response["Body"].read().decode("ascii")
print("Predicted Class Probabilities: {}.".format(result))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary and could not load the entire response body. See https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#logEventViewer:group=/aws/sagemaker/Endpoints/DEMO-XGBoostEndpoint-2023-06-08-21-16-59 in account 444133344330 for more information.