In [1]:
import os
import boto3
import re
import json
import sagemaker
import numpy as np
from sagemaker import get_execution_role
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
import xgboost as xgb
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, fmin, tpe, Trials, hp
from hyperopt.early_stop import no_progress_loss

In [2]:
test_split = 0.2
random_state = 42
number_of_trees = 1000
model_file_name = "local-xgboost-model-scikit"
best_param = {
    'alpha': 0,
    'n_estimators': 1000,
    'booster': 'gbtree',
    'silent': 1,
    'nthread': -1,
    "colsample_bytree": 0.9,
    "gamma": 0.7000000000000001,
    "learning_rate": 0.6000000000000001,
    "max_depth": 4,
    "reg_lambda": 2,
    "subsample": 0.5,
    'objective': 'reg:pseudohubererror',
    'eval_metric': 'mphe'}

In [3]:
region = boto3.Session().region_name
print(region)
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='SageMaker-MLops')['Role']['Arn']
print(role)
bucket = sagemaker.Session().default_bucket()
print(bucket)

eu-central-1


Couldn't call 'get_role' to get Role ARN from role name ashrith-iam to get Role path.


arn:aws:iam::444133344330:role/service-role/SageMaker-MLops
sagemaker-eu-central-1-444133344330


In [4]:
remove_first_col = (
    lambda df: df[[df.columns[i] for i in range(len(df.columns)) if i != 0]]
    .astype(str)
    .astype(int)
)
loss = lambda y, y_pred: np.sum(abs(np.subtract(np.array(y), np.array(y_pred))))

In [5]:
prefix = "sagemaker/DEMO-xgboost-byo"
bucket_path = "https://s3-{}.amazonaws.com/{}".format(region, bucket)

In [6]:
# Get the data from a public S3
buf_x = (
    boto3.client("s3")
    .get_object(
        Bucket=bucket, Key="labels_x.zip"
    )["Body"].read()
)
csv_file_key = 'labels_x/image_vector.csv'
with ZipFile(BytesIO(buf_x)) as zipf:
    csv_file = zipf.open(csv_file_key)
    x = pd.read_csv(csv_file, sep=',', header=None)
print(x.head())
x.shape

     0    1    2    3    4    5    6    7    8    9   ...   70   71   72   73  \
0  13.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  14.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
3  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  12.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    74   75   76   77   78   79  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 80 columns]


(1000, 80)

In [7]:
# Get the y data
buf_y = (
    boto3.client("s3")
    .get_object(
        Bucket=bucket, Key="labels_y.zip"
    )["Body"].read()
)
with ZipFile(BytesIO(buf_y)) as zipf:
    files = zipf.namelist()
    # delete parent folder in list
    del files[0]
    print(files)
    y_full = pd.DataFrame(
        np.column_stack(
            [
                files,
                pd.concat(
                    (
                        pd.read_csv(zipf.open(f), sep=",", header=None)
                        for f in files
                    )
                ).values.tolist(),
            ]
        )
    )
print(y_full.shape)
print(y_full.head())
y = remove_first_col(y_full)
print(y.head())
print(y.shape)

['labels_y/frame_Giving_money_vid_900.csv', 'labels_y/frame_Giving_money_vid_34110.csv', 'labels_y/frame_Giving_money_vid_19260.csv', 'labels_y/frame_Giving_money_vid_38070.csv', 'labels_y/frame_Giving_money_vid_9360.csv', 'labels_y/frame_Giving_money_vid_13410.csv', 'labels_y/frame_Giving_money_vid_15030.csv', 'labels_y/frame_Giving_money_vid_14610.csv', 'labels_y/frame_Giving_money_vid_5430.csv', 'labels_y/frame_Giving_money_vid_17460.csv', 'labels_y/frame_Giving_money_vid_26880.csv', 'labels_y/frame_Giving_money_vid_16710.csv', 'labels_y/frame_Giving_money_vid_38610.csv', 'labels_y/frame_Giving_money_vid_14400.csv', 'labels_y/frame_Giving_money_vid_4500.csv', 'labels_y/frame_Giving_money_vid_28140.csv', 'labels_y/frame_Giving_money_vid_11880.csv', 'labels_y/frame_Giving_money_vid_16440.csv', 'labels_y/frame_Giving_money_vid_18840.csv', 'labels_y/frame_Giving_money_vid_11520.csv', 'labels_y/frame_Giving_money_vid_27480.csv', 'labels_y/frame_Giving_money_vid_38220.csv', 'labels_y/fram

In [8]:
# split test and train
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_split,
    random_state=random_state,
    shuffle=True,
    stratify=y,
)
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_test = y_test.to_numpy()
y_train = y_train.to_numpy()
x_test.dtype
print(np.shape(x_train), np.shape(y_train), np.shape(x_test))

(800, 80) (800, 3) (200, 80)


In [9]:
bt = xgb.XGBClassifier( 
    alpha = best_param['alpha'],
    n_estimators = best_param['n_estimators'],
    booster =best_param['booster'],
    verbosity = 0,
    n_jobs = best_param['nthread'],
    colsample_bytree = best_param["colsample_bytree"],
    gamma = best_param["gamma"],
    learning_rate = best_param["learning_rate"],
    max_depth = best_param["max_depth"],
    reg_lambda= best_param["reg_lambda"],
    subsample=best_param["subsample"],
    objective = 'reg:pseudohubererror',
    eval_metric = 'mphe')   # Setup xgboost model

bt.fit(x_train, y_train, # Train it to our data
       eval_set=[(x_test, y_test)], 
       verbose=True)

[0]	validation_0-mphe:0.11169
[1]	validation_0-mphe:0.11282
[2]	validation_0-mphe:0.11262
[3]	validation_0-mphe:0.11490
[4]	validation_0-mphe:0.11536
[5]	validation_0-mphe:0.11690
[6]	validation_0-mphe:0.11851
[7]	validation_0-mphe:0.12167
[8]	validation_0-mphe:0.12251
[9]	validation_0-mphe:0.12258
[10]	validation_0-mphe:0.12305
[11]	validation_0-mphe:0.12334
[12]	validation_0-mphe:0.12217
[13]	validation_0-mphe:0.12142
[14]	validation_0-mphe:0.12201
[15]	validation_0-mphe:0.12286
[16]	validation_0-mphe:0.12399
[17]	validation_0-mphe:0.12543
[18]	validation_0-mphe:0.12675
[19]	validation_0-mphe:0.12669
[20]	validation_0-mphe:0.12802
[21]	validation_0-mphe:0.12870
[22]	validation_0-mphe:0.12896
[23]	validation_0-mphe:0.12893
[24]	validation_0-mphe:0.12950
[25]	validation_0-mphe:0.12893
[26]	validation_0-mphe:0.12836
[27]	validation_0-mphe:0.12873
[28]	validation_0-mphe:0.13047
[29]	validation_0-mphe:0.13017
[30]	validation_0-mphe:0.13105
[31]	validation_0-mphe:0.13065
[32]	validation_0-

In [10]:
model = bt
y_pred = model.predict(x_test)
print("hamming loss", loss(y_pred, y_test))

hamming loss 257.0


In [11]:
model._Booster.save_model(model_file_name)

In [12]:
!tar czvf model.tar.gz $model_file_name

local-xgboost-model-scikit


In [13]:
prefix = "sagemaker/DEMO-xgboost"
fObj = open("model.tar.gz", "rb")
key = os.path.join(prefix, model_file_name, "model.tar.gz")
boto3.Session().resource("s3").Bucket(bucket).Object(key).upload_fileobj(fObj)

In [14]:
from sagemaker import image_uris

container = image_uris.retrieve("xgboost",boto3.Session().region_name, "0.90-2")

In [15]:
%%time
from time import gmtime, strftime

model_name = model_file_name + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_url = "https://s3-{}.amazonaws.com/{}/{}".format(region, bucket, key)
sm_client = boto3.client("sagemaker")

print(model_url)

primary_container = {
    "Image": container,
    "ModelDataUrl": model_url,
}
print(role)
create_model_response2 = sm_client.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

#print(create_model_response2["ModelArn"])


https://s3-eu-central-1.amazonaws.com/sagemaker-eu-central-1-444133344330/sagemaker/DEMO-xgboost/local-xgboost-model-scikit/model.tar.gz
arn:aws:iam::444133344330:role/service-role/SageMaker-MLops
CPU times: user 199 ms, sys: 8.29 ms, total: 207 ms
Wall time: 1.02 s


In [16]:
from time import gmtime, strftime

endpoint_config_name = "DEMO-XGBoostEndpointConfig-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "InitialVariantWeight": 1,
            "ModelName": model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-XGBoostEndpointConfig-2023-07-03-13-53-49
Endpoint Config Arn: arn:aws:sagemaker:eu-central-1:444133344330:endpoint-config/demo-xgboostendpointconfig-2023-07-03-13-53-49


In [17]:
%%time
import time

endpoint_name = "DEMO-XGBoostEndpoint-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response["EndpointArn"])

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

DEMO-XGBoostEndpoint-2023-07-03-13-53-52
arn:aws:sagemaker:eu-central-1:444133344330:endpoint/demo-xgboostendpoint-2023-07-03-13-53-52
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:eu-central-1:444133344330:endpoint/demo-xgboostendpoint-2023-07-03-13-53-52
Status: InService
CPU times: user 50.7 ms, sys: 3.85 ms, total: 54.6 ms
Wall time: 3min 1s


In [18]:
runtime_client = boto3.client("sagemaker-runtime")

In [19]:
print(np.shape(x_test))
row = x_test[0]
print(np.shape(row))
row = np.expand_dims(row, axis=0)
file_name = "test.csv"
np.savetxt(file_name, row, delimiter=',')

(200, 80)
(80,)


In [20]:
with open(file_name, "r") as f:
    payload = f.read().strip()
print(payload)

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="text/csv", Body=payload
)
result = response["Body"].read().decode("ascii")
print("Predicted Class Probabilities: {}.".format(result))

4.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,0.000000000000000000e+00,2.000000000000000000e+00,

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary and could not load the entire response body. See https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#logEventViewer:group=/aws/sagemaker/Endpoints/DEMO-XGBoostEndpoint-2023-07-03-13-53-52 in account 444133344330 for more information.

In [28]:
response = sm_client.list_endpoints(StatusEquals='InService')

# Extract the endpoint names from the response
endpoint_names = [endpoint['EndpointName'] for endpoint in response['Endpoints']]
print(endpoint_names)

for e in endpoint_names:
    sm_client.delete_endpoint(EndpointName=e)

[]


In [69]:
%%time
import pickle, gzip, numpy, urllib.request, json

# Load the dataset
urllib.request.urlretrieve("https://www-labs.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
f.close()

CPU times: user 1.19 s, sys: 546 ms, total: 1.74 s
Wall time: 14.7 s


In [70]:
%%time

import struct
import io
import boto3

def get_dataset():
  import pickle
  import gzip
  with gzip.open('mnist.pkl.gz', 'rb') as f:
      u = pickle._Unpickler(f)
      u.encoding = 'latin1'
      return u.load()

CPU times: user 25 µs, sys: 1 µs, total: 26 µs
Wall time: 31.2 µs


In [73]:
train_set, valid_set, test_set = get_dataset()

train_X = train_set[0]
train_y = train_set[1]

valid_X = valid_set[0]
valid_y = valid_set[1]

test_X = test_set[0]
test_y = test_set[1]

print(test_X[0][0])
print(np.shape(test_X), np.shape(test_y))
test_X

0.0
(10000, 784) (10000,)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)