In [1]:
from sagemaker import get_execution_role

# Look up the execution role through the SageMaker SDK; it is passed to the
# Estimator further down as `role`.
role = get_execution_role()

In [4]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
# One shared Session object, used by the SageMaker calls in this notebook.
session = sagemaker.Session()

# URI of the built-in XGBoost image for the region the session is bound to.
container = get_image_uri(
    region_name=session.boto_region_name,
    repo_name='xgboost',
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [6]:

# Estimator for the built-in XGBoost container.
# `container` was already resolved in the cell that creates `session`, so it is
# reused here instead of calling get_image_uri() a second time (same URI, one
# fewer lookup and no repeated deprecation warnings).
xgb = sagemaker.estimator.Estimator(
    image_name=container,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c5.9xlarge',
    # Spot training settings: run limit and wait limit are both set to 3600.
    train_max_run=3600,
    train_max_wait=3600,
    train_use_spot_instances=True,
    output_path='s3://sagemaker-us-east-1-032934527328/sentimentanalysis-hyperparam/output',
    sagemaker_session=session,
)

# Starting hyperparameters. The tuning job defined later searches over
# max_depth, eta, gamma, min_child_weight and subsample; the remaining values
# (objective, early_stopping_rounds, num_round) stay fixed.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=200,
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:

from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner



# Tuner built around the `xgb` estimator; candidate models are compared on
# 'validation:rmse', which is minimised.
# NOTE(review): the estimator's objective is 'binary:logistic' -- confirm that
# RMSE (rather than a classification metric) is the intended tuning target.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',  # minimise rather than maximise the metric
    max_jobs=6,                 # total number of models to train
    max_parallel_jobs=3,        # number of models to train in parallel
    hyperparameter_ranges=dict(
        max_depth=IntegerParameter(3, 12),
        eta=ContinuousParameter(0.05, 0.5),
        min_child_weight=IntegerParameter(2, 8),
        subsample=ContinuousParameter(0.5, 0.9),
        gamma=ContinuousParameter(0, 10),
    ),
)



In [9]:
# CSV inputs in S3 for the two channels handed to the tuner's fit() call.
s3_input_train = sagemaker.s3_input(
    s3_data='s3://sagemaker-us-east-1-032934527328/sentimentanalysis/train.csv',
    content_type='csv',
)
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://sagemaker-us-east-1-032934527328/sentimentanalysis/validation.csv',
    content_type='csv',
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [12]:
# Start the tuning job, mapping each channel name to its S3 input.
xgb_hyperparameter_tuner.fit(
    inputs=dict(train=s3_input_train, validation=s3_input_validation)
)

In [None]:
# Block until the tuning job started by fit() has finished; the dots printed
# below are its progress indicator.
xgb_hyperparameter_tuner.wait()

...............................................................................................!


In [18]:

# Re-create an Estimator from the training job the tuner reports as best, so
# it can be used for batch transform below.
xgb_attached = sagemaker.estimator.Estimator.attach(
    training_job_name=xgb_hyperparameter_tuner.best_training_job(),
    sagemaker_session=session,
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2021-04-14 22:49:37 Starting - Preparing the instances for training
2021-04-14 22:49:37 Downloading - Downloading input data
2021-04-14 22:49:37 Training - Training image download completed. Training in progress.
2021-04-14 22:49:37 Uploading - Uploading generated training model
2021-04-14 22:49:37 Completed - Training job completed[34mArguments: train[0m
[34m[2021-04-14:22:48:44:INFO] Running standalone xgboost training.[0m
[34m[2021-04-14:22:48:44:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2021-04-14:22:48:44:INFO] File size need to be processed in the node: 238.47mb. Available memory size in the node: 62008.95mb[0m
[34m[2021-04-14:22:48:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:48:44] S3DistributionType set as FullyReplicated[0m
[34m[22:48:46] 16750x5000 matrix with 83750000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-04-14:22:48:46:INFO] Determined delimiter of CSV input is ','[0m

In [19]:
# Create a transformer object from the attached estimator, using a single
# ml.m5.large instance.

xgb_transformer = xgb_attached.transformer(instance_count=1,instance_type='ml.m5.large')


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [20]:

# Run batch transform over the test CSV; the input is split line by line.
xgb_transformer.transform(
    data='s3://sagemaker-us-east-1-032934527328/sentimentanalysis/test.csv',
    content_type='text/csv',
    split_type='Line',
)

In [21]:
# Block until the transform job has finished; the container logs are streamed
# into the cell output below.
xgb_transformer.wait()

...........................[34mArguments: serve[0m
[34m[2021-04-14 23:07:55 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-04-14 23:07:55 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-04-14 23:07:55 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-04-14 23:07:55 +0000] [19] [INFO] Booting worker with pid: 19[0m
[34m[2021-04-14 23:07:55 +0000] [20] [INFO] Booting worker with pid: 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-04-14:23:07:55:INFO] Model loaded successfully for worker : 19[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-04-14:23:07:55:INFO] Model loaded successfully for worker : 20[0m
[35mArguments: serve[0m
[35m[2021-04-14 23:07:55 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-04-14 23:07:55 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-04-14 23:07:55 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-04-14 23:07:55 +0000] [19] [INFO] Booting worker with pid: 19[0

[34m[2021-04-14:23:08:51:INFO] Sniff delimiter as ','[0m
[34m[2021-04-14:23:08:51:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-14:23:08:51:INFO] Sniff delimiter as ','[0m
[35m[2021-04-14:23:08:51:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-14:23:08:53:INFO] Sniff delimiter as ','[0m
[34m[2021-04-14:23:08:53:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-14:23:08:53:INFO] Sniff delimiter as ','[0m
[35m[2021-04-14:23:08:53:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-14:23:08:55:INFO] Sniff delimiter as ','[0m
[35m[2021-04-14:23:08:55:INFO] Sniff delimiter as ','[0m
[34m[2021-04-14:23:08:55:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-14:23:08:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-14:23:08:56:INFO] Sniff delimiter as ','[0m
[34m[2021-04-14:23:08:56:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-14:23:08:56:INFO] Sniff delimiter 

In [24]:
# List the local data directory before fetching the transform output.
# (xgb_transformer.output_path is the S3 location that output is copied from
# in the next cell.)
!ls data-sentiment

aclImdb		   data-sentiment-processed.pkl  sagemakerready
aclImdb_v1.tar.gz  Hyperparameters		 test.csv.out


In [25]:
# Copy everything under the transform job's S3 output path into the local
# Hyperparameters directory. IPython substitutes the Python value of
# xgb_transformer.output_path for the `$...` expression before running the
# shell command.
!aws s3 cp --recursive $xgb_transformer.output_path data-sentiment/Hyperparameters

Completed 256.0 KiB/477.0 KiB (2.2 MiB/s) with 1 file(s) remainingCompleted 477.0 KiB/477.0 KiB (3.9 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-032934527328/xgboost-210414-2245-003-c0cf89e9-2021-04-14-23-03-36-863/test.csv.out to data-sentiment/Hyperparameters/test.csv.out


In [35]:
import pandas as pd
import pickle
# Load the preprocessed dataset and keep the ground-truth labels of the test
# split for scoring.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted step of this project.
with open('data-sentiment/data-sentiment-processed.pkl', 'rb') as f:
    processed_data = pickle.load(f)

test_y = processed_data['test']['labels']

In [33]:
# Read the batch-transform output (no header row) and turn the scores into
# integer class labels.
raw_scores = pd.read_csv('data-sentiment/Hyperparameters/test.csv.out', header=None)

# squeeze(axis=1) collapses only the column axis, so a file with a single row
# still yields a Series (a bare squeeze() would collapse it to a scalar and
# break the `.values` access). Series.round() rounds half to even, like the
# built-in round(), and astype(int) makes the label type independent of the
# installed numpy version.
predictions = raw_scores.squeeze(axis=1).round().astype(int).tolist()

In [36]:
from sklearn.metrics import accuracy_score
# Share of test examples whose predicted label equals the true label; left as
# the last expression so the notebook displays the value.
accuracy_score(test_y, predictions)

0.86112