In [2]:
%matplotlib inline
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import sklearn.model_selection

In [5]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

session = sagemaker.Session()
role = get_execution_role()

In [6]:
boston = load_boston()

In [9]:
X_bos_pd = pd.DataFrame(boston.data , columns = boston.feature_names)
Y_bos_pd = pd.DataFrame(boston.target)

X_train,X_test,Y_train,Y_test = sklearn.model_selection.train_test_split(X_bos_pd,Y_bos_pd,test_size = 0.33)
X_train,X_valid,Y_train,Y_valid = sklearn.model_selection.train_test_split(X_train,Y_train,test_size = 0.33)

In [10]:
data_dir = '../data/boston'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [13]:
pd.concat([Y_valid,X_valid],axis = 1).to_csv(os.path.join(data_dir,'validation.csv'),header = False , index = False)
pd.concat([Y_train,X_train],axis = 1).to_csv(os.path.join(data_dir,'train.csv'),header = False , index = False)

In [14]:
prefix = 'boston-xgboost-deploy-hl'
train_location = session.upload_data(os.path.join(data_dir,'train.csv'),key_prefix = prefix)
val_location = session.upload_data(os.path.join(data_dir,'validation.csv'),key_prefix=prefix)

In [16]:
container = get_image_uri(session.boto_region_name,'xgboost')
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count = 1,
                                    train_instance_type = 'ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(session.default_bucket,prefix),
                                    sagemaker_session = session)

In [21]:
xgb.set_hyperparameters(max_depth = 5, #建構決策樹最大深度
                        eta =0.2 , #類似於學習率
                        min_child_weight = 6, #若執行個體權重總和低於此數字，建置程序將進一步區分
                        subsample = 0.5, #為決策樹中的dropout，相似於神經網絡的dropout
                        objective = 'reg:linear', #學習任務:對應的學習目標
                        early_stopping_rounds = 10 , #驗證損失不再減少N次時停止訓練
                        num_round = 200)

In [22]:
s3_input_train =sagemaker.s3_input(s3_data = train_location , content_type='csv')
s3_input_val = sagemaker.s3_input(s3_data = val_location , content_type = 'csv')
xgb.fit({'train':s3_input_train , 'validation':s3_input_val})

INFO:sagemaker:Creating training-job with name: xgboost-2019-04-26-16-54-41-912


2019-04-26 16:54:42 Starting - Starting the training job...
2019-04-26 16:54:47 Starting - Launching requested ML instances......
2019-04-26 16:55:49 Starting - Preparing the instances for training......
2019-04-26 16:57:05 Downloading - Downloading input data
2019-04-26 16:57:05 Training - Downloading the training image..
[31mArguments: train[0m
[31m[2019-04-26:16:57:21:INFO] Running standalone xgboost training.[0m
[31m[2019-04-26:16:57:21:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8409.6mb[0m
[31m[2019-04-26:16:57:21:INFO] Determined delimiter of CSV input is ','[0m
[31m[16:57:21] S3DistributionType set as FullyReplicated[0m
[31m[16:57:21] 227x13 matrix with 2951 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-04-26:16:57:21:INFO] Determined delimiter of CSV input is ','[0m
[31m[16:57:21] S3DistributionType set as FullyReplicated[0m
[31m[16:57:21] 112x13 matrix with 1456

ValueError: Error for Training job xgboost-2019-04-26-16-54-41-912: Failed Reason: ClientError: Artifact upload failed:Error 7: InvalidBucketName (400): The specified bucket is not valid.

In [26]:
xgb_predictor = xgb.deploy(initial_instance_count = 1 ,instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2019-04-26-16-59-14-175


ClientError: An error occurred (ValidationException) when calling the CreateModel operation: 1 validation error detected: Value at "primaryContainer.modelDataUrl" failed to satisfy constraint: Member must satisfy regular expression pattern: [\S]+

In [25]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

Y_pred = xgb_predictor(X_test.values),decode('utf-8') 
Y_pred = np.fromstring(Y_pred,sep=',')

NameError: name 'xgb_predictor' is not defined

In [None]:
plt.scatter(Y_test,Y_pred)
plt.xlabel("Median Price")
plt.ylabel("Predicted Price")
plt.title("Median Price vs Predicted Price")

In [24]:
xgb_predictor.delete_endpoint()

NameError: name 'xgb_predictor' is not defined