In [1]:
import struct
import io
import boto3
import datetime
from time import gmtime, strftime
import time

In [2]:
import sagemaker
import boto3
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import os 
from sagemaker.amazon.amazon_estimator import get_image_uri

region = boto3.Session().region_name    
container = get_image_uri(region, 'xgboost', '0.90-1')
smclient = boto3.Session().client('sagemaker')

role = sagemaker.get_execution_role()
sess = sagemaker.Session()

bucket = sagemaker.Session().default_bucket()                     
prefix = 'sagemaker/invoice-prediction/distributed'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)


In [3]:
# Param # of partitions (should be less than )
numberOfPartitions = 5

In [4]:
train = pd.read_csv('train.csv',header=None)
validation = pd.read_csv('validation.csv',header=None)

In [5]:
train = train.dropna()
validation = validation.dropna()

In [11]:
validation.shape

(135546, 71)

In [19]:
def to_libsvm(x):
    label = x[0]
    feats = list(x[1:])
    feats_lib = ' '.join(['{}:{}'.format(i + 1, el) for i, el in enumerate(feats)])
    record = '{} {}'.format(label,feats_lib)
    return record

In [20]:
def write_to_s3(fobj, bucket, key):
    return boto3.Session(region_name=region).resource('s3').Bucket(bucket).Object(key).upload_fileobj(fobj)

In [21]:
def upload_to_s3(channel_name, channel_ds):
    libsvm_records = list(channel_ds.apply(to_libsvm,axis=1))
    print(len(libsvm_records))
    num_partition = 5                                 # partition file into 5 parts
    partition_bound = int(len(channel_ds)/num_partition)

    for i in range(num_partition):
        f = io.BytesIO()
        records_subset = libsvm_records[i*partition_bound:(i+1)*partition_bound]
        print('Partition of {} records'.format(len(records_subset)))
        f.write(bytes('\n'.join(records_subset), 'utf-8')) 
        f.seek(0)
        key = "{}/{}/examples{}".format(prefix,channel_name,str(i))
        url = 's3n://{}/{}'.format(bucket, key)
        print('Writing to {}'.format(url))
        write_to_s3(f, bucket, key)
        print('Done writing to {}'.format(url))

In [24]:
def convert_data():
    channels = [('train', train), ('validation', validation)]
    for channel_name, channel_ds in channels:
        upload_to_s3(channel_name, channel_ds)

In [25]:
print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
convert_data()
print (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

2019-09-21 19:27:41
1221302
Partition of 244260 records
Writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/train/examples0
sagemaker-us-east-1-452432741922 sagemaker/invoice-prediction/distributed/train/examples0
Done writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/train/examples0
Partition of 244260 records
Writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/train/examples1
sagemaker-us-east-1-452432741922 sagemaker/invoice-prediction/distributed/train/examples1
Done writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/train/examples1
Partition of 244260 records
Writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/train/examples2
sagemaker-us-east-1-452432741922 sagemaker/invoice-prediction/distributed/train/examples2
Done writing to s3n://sagemaker-us-east-1-452432741922/sagemaker/invoice-

In [26]:
#Ensure that the train and validation data folders generated above are reflected in the "InputDataConfig" parameter below.
training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/"+ prefix + "/xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,   
        "InstanceType": "ml.m4.10xlarge",
        "VolumeSizeInGB": 100
    },
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "silent":"0",
        "objective":"reg:linear",
        "num_round": "10"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 86400
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/"+ prefix+ '/train/',
                    "S3DataDistributionType": "FullyReplicated" 
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": bucket_path + "/"+ prefix+ '/validation/',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        }
    ]
}

In [27]:
#distributed job params
distributed_job_name = 'xgboost-pricepredictions-regression' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Job name is:", distributed_job_name)

distributed_job_params = training_params
distributed_job_params['TrainingJobName'] = distributed_job_name
distributed_job_params['OutputDataConfig']['S3OutputPath'] = bucket_path + "/"+ prefix + "/xgboost-distributed"
#number of instances used for training
distributed_job_params['ResourceConfig']['InstanceCount'] = 2 # no more than 5 if there are total 5 partition files generated above

# data distribution type for train channel
distributed_job_params['InputDataConfig'][0]['DataSource']['S3DataSource']['S3DataDistributionType'] = 'ShardedByS3Key'
# data distribution type for validation channel
distributed_job_params['InputDataConfig'][1]['DataSource']['S3DataSource']['S3DataDistributionType'] = 'ShardedByS3Key'

Job name is: xgboost-pricepredictions-regression2019-09-21-19-40-29


In [28]:
%%time

sm = boto3.Session(region_name=region).client('sagemaker')

sm.create_training_job(**distributed_job_params)

status = sm.describe_training_job(TrainingJobName=distributed_job_name)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=distributed_job_name)
status = sm.describe_training_job(TrainingJobName=distributed_job_name)['TrainingJobStatus']
print("Training job ended with status: " + status)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=distributed_job_name)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress
Training job ended with status: Completed
CPU times: user 98.6 ms, sys: 16.1 ms, total: 115 ms
Wall time: 4min


## Create Model

In [30]:

model_name=distributed_job_name + '-mod'
print(model_name)

info = sm.describe_training_job(TrainingJobName=distributed_job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = sm.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

xgboost-pricepredictions-regression2019-09-21-19-40-29-mod
https://s3-us-east-1.amazonaws.com/sagemaker-us-east-1-452432741922/sagemaker/invoice-prediction/distributed/xgboost-distributed/xgboost-pricepredictions-regression2019-09-21-19-40-29/output/model.tar.gz
arn:aws:sagemaker:us-east-1:452432741922:model/xgboost-pricepredictions-regression2019-09-21-19-40-29-mod


## Create endpoint configuration

In [31]:
endpoint_config_name = 'XGBoostEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialVariantWeight':1,
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

XGBoostEndpointConfig-2019-09-21-19-47-48
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:452432741922:endpoint-config/xgboostendpointconfig-2019-09-21-19-47-48


## Create Endpoint

In [None]:
endpoint_name = 'XGBoostEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

resp = sm.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print("Status: " + status)

while status=='Creating':
    time.sleep(60)
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)

print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

XGBoostEndpoint-2019-09-21-19-47-53
arn:aws:sagemaker:us-east-1:452432741922:endpoint/xgboostendpoint-2019-09-21-19-47-53
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating


In [36]:
runtime_client = boto3.client('runtime.sagemaker', region_name=region)

In [54]:
import botocore
def download_from_s3(channel_name, number):
    filename = 'examples{}'.format(number)
    key = "{}/{}/{}".format(prefix,channel_name,filename)
    s3 = boto3.resource('s3', region_name = region)
    try:
        s3.Bucket(bucket).download_file(key, filename)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('The object does not exist at {}.'.format(key+filename))
        else:
            raise        
        

In [55]:
download_from_s3('validation', 0) # reading the first part file within test

sagemaker/invoice-prediction/distributed/validation/examples0 examples0


In [59]:
!head -1 examples0

22307.401 1:17793.093888888892 2:16508.30175 3:6137.864423836161 4:0.0 5:12505.536999999998 6:0.0 7:0.0 8:1.0 9:0.0 10:0.0 11:0.0 12:0.0 13:0.0 14:0.0 15:0.0 16:0.0 17:0.0 18:0.0 19:1.0 20:0.0 21:0.0 22:0.0 23:0.0 24:0.0 25:0.0 26:0.0 27:0.0 28:0.0 29:0.0 30:0.0 31:0.0 32:0.0 33:0.0 34:0.0 35:0.0 36:0.0 37:0.0 38:0.0 39:0.0 40:0.0 41:0.0 42:0.0 43:0.0 44:0.0 45:0.0 46:0.0 47:0.0 48:0.0 49:0.0 50:0.0 51:0.0 52:0.0 53:0.0 54:0.0 55:0.0 56:0.0 57:0.0 58:0.0 59:0.0 60:0.0 61:0.0 62:0.0 63:0.0 64:0.0 65:0.0 66:0.0 67:0.0 68:0.0 69:1.0 70:0.0


In [57]:
!head -1 examples0 > singleexamples0

In [58]:
import json

file_name = 'singleexamples0' #customize to your test file 'mnist.single.test' if use the data above

with open(file_name, 'r') as f:
    payload = f.read()

response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='text/x-libsvm', 
                                   Body=payload)
result = response['Body'].read().decode('ascii')
print('Predicted label is {}.'.format(result))

Predicted label is 16245.03515625.
