### SELECT WHICH DATASET

In [None]:
# Dataset identifier. It is interpolated into the S3 object names that are
# loaded ("train{}" / "test{}") and into the 'output{}' prefix used further down.
dataset = "001"

### Import some stuff

In [None]:
import pandas as pd
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

### Initialize session, role, region, and container

In [None]:
# SageMaker session and the IAM role this notebook instance executes under.
sess = sagemaker.Session()
role = get_execution_role()

# Resolve the region once and reuse it to look up the XGBoost image.
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

### Define Load Data function

In [61]:
def load_from_s3(fname, bucket, pre):
    """Read a header-less CSV object from S3 into a DataFrame.

    The object key is built as ``pre + fname + ".csv"``; no separator is
    inserted, so ``pre`` must already end with "/" if it names a folder.

    Args:
        fname: Object name without the ".csv" extension.
        bucket: Name of the S3 bucket to read from.
        pre: Key prefix placed directly in front of ``fname``.

    Returns:
        pandas.DataFrame with integer column labels (``header=None``).
    """
    object_key = "{}{}.csv".format(pre, fname)
    body = boto3.client('s3').get_object(Bucket=bucket, Key=object_key)["Body"]
    return pd.read_csv(body, low_memory=False, header=None)

In [63]:
# Pull the raw train/test CSVs for the selected dataset from the
# 'sagemaker/' folder of the project bucket.
bucket = 'bryan-predictive-maintenance'
prefix = 'sagemaker/'
train, test = (
    load_from_s3("{}{}".format(split, dataset), bucket, prefix)
    for split in ("train", "test")
)

### Split training data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the training rows for validation (fixed seed for repeatability).
# NOTE: `train` is rebound to the smaller split, so re-running this cell
# without re-loading the data first shrinks the training set again.
train, validate = train_test_split(
    train,
    test_size=0.33,
    random_state=123,
)

### Write to s3

In [None]:
# Destination bucket and per-dataset key prefix read by write_to_csv below.
bucket = 'bryan-predictive-maintenance'
prefix = 'output{}'.format(dataset)

def write_to_csv(df, fname, channel):
    """Save ``df`` as a header-less CSV locally and upload it to S3.

    The file is written to ``fname`` in the current working directory and then
    uploaded to ``s3://<bucket>/<prefix>/<channel>/<fname>``, where ``bucket``
    and ``prefix`` are the module-level variables at call time.

    Args:
        df: DataFrame to serialise (written without index or header row).
        fname: Local file name, also used as the last component of the S3 key.
        channel: Sub-folder under ``prefix`` (e.g. 'train', 'validation', 'test').
    """
    # Save file locally (no index column, no header row)
    df.to_csv(fname, index=False, header=False)

    # Create connection
    s3conn = boto3.client('s3')

    # Write file. Open in binary mode and close the handle deterministically;
    # the original passed an unclosed text-mode handle to put_object.
    outfile = '{}/{}/{}'.format(prefix, channel, fname)
    with open(fname, 'rb') as body:
        s3conn.put_object(
            Body=body,
            Bucket=bucket,
            Key=outfile
        )

In [None]:
# Upload each split to its own channel folder under the output prefix.
write_to_csv(df=train, fname='train.csv', channel='train')
write_to_csv(df=validate, fname='validate.csv', channel='validation')
write_to_csv(df=test, fname='test.csv', channel='test')

### "Borrowed" hyperparameter tuning code

In [None]:
# Session, execution role, region and XGBoost training image.
sess = sagemaker.Session()
role = get_execution_role()
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost')

# Low-level SageMaker API client. The tuning-job, model, endpoint-config and
# endpoint cells below all call methods on `client`, but it was never defined
# in the notebook, so a fresh kernel failed with NameError.
client = boto3.client('sagemaker')

# S3 locations for this dataset's training channels and model output.
bucket = 'bryan-predictive-maintenance'
prefix = 'output{}'.format(dataset)
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

#### Create tuning job name

In [None]:
from time import gmtime, strftime, sleep

# Substitute the dataset id into the name. With the .format() call commented
# out, the literal "{}" stayed in the job name, which SageMaker rejects
# (only alphanumerics and hyphens are accepted).
# Tuning job names must be unique and at most 32 characters; append
# strftime("%d-%H-%M-%S", gmtime()) when a fresh name per run is needed.
tuning_job_name = 'xgboostHPTuning-ds{}'.format(dataset)
print(tuning_job_name)

#### Define model training params for use in hyperparameter tuning

In [None]:
def _s3_csv_channel(channel_name):
    """Build one InputDataConfig entry for an uncompressed CSV channel.

    The channel reads every object under ``bucket_path/prefix/<channel_name>``
    (module-level ``bucket_path`` and ``prefix`` at call time).
    """
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": bucket_path + "/" + prefix + "/" + channel_name,
                "S3DataDistributionType": "FullyReplicated"
            }
        },
        "ContentType": "csv",
        "CompressionType": "None"
    }


# Training job definition shared by every job the tuner launches.
create_training_params = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/" + prefix + "/single-xgboost"
    },
    # Hyperparameters held fixed across all tuning trials.
    # NOTE(review): rate_drop and tweedie_variance_power look unrelated to the
    # reg:linear objective - confirm whether they are needed.
    "StaticHyperParameters": {
        "eval_metric": "rmse",
        "num_round": "100",
        "objective": "reg:linear",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    # Same channel names that write_to_csv was called with.
    "InputDataConfig": [
        _s3_csv_channel("train"),
        _s3_csv_channel("validation"),
        _s3_csv_channel("test")
    ]
}

#### Define hyperparameter params for use in hyperparameter tuning

In [None]:
def _param_range(name, min_value, max_value):
    """Return one parameter-range entry; bounds are passed as strings."""
    return {
        "Name": name,
        "MinValue": min_value,
        "MaxValue": max_value
    }


# Search space, resource limits, strategy and objective for the tuning job.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            _param_range("eta", "0", "1"),
            _param_range("min_child_weight", "1", "10"),
            _param_range("alpha", "0", "2")
        ],
        "IntegerParameterRanges": [
            _param_range("max_depth", "1", "10")
        ]
    },
    # 50 training jobs in total, at most 5 running at once.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 50,
        "MaxParallelTrainingJobs": 5
    },
    "Strategy": "Bayesian",
    # Pick the job with the lowest RMSE on the validation channel.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:rmse",
        "Type": "Minimize"
    }
}

#### Create hyper parameter tuning job

In [None]:
# Launch the tuning job: the tuner config supplies the search space and
# objective, and TrainingJobDefinition supplies the per-job training params.
client.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=create_training_params,
)

### Just some fun looking at model data and analyzing training jobs

In [41]:
# Inspect a specific, previously completed tuning job (name is hard-coded,
# not taken from tuning_job_name).
output = client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName="tuning-olsen-ds001-01-17-45-46"
)

best_job = output['BestTrainingJob']
score = best_job['FinalHyperParameterTuningJobObjectiveMetric']['Value']
job_name = best_job['TrainingJobName']  # reused below when importing the model

# print() with a single argument and .items() behave the same on Python 2 and
# Python 3, matching the print() calls used in the hosting cells.
print("Best training job: {}".format(job_name))
print("Best RMSE: {}".format(score))
print("Tuned Parameters:")
# Sort by name so the listing is stable regardless of dict ordering.
for key, value in sorted(best_job['TunedHyperParameters'].items()):
    print("\t{}: {}".format(key, value))

Best training job: tuning-olsen-ds001-01-17-45-46-023-384e6b07
Best RMSE: 16.5349006653
Tuned Parameters:
	alpha: 0.03027522780033955
	eta: 0.08688211827344429
	max_depth: 10
	min_child_weight: 1.0764150792800096


## Set up hosting!

#### Import model into hosting

In [42]:
%%time
import boto3
from time import gmtime, strftime

# Name under which the cherry-picked model is registered for hosting.
model_name = "tuning-olsen-ds001-01-17-45-46"
print(model_name)

# Look up where the best training job (job_name, set in the cell above)
# stored its model artifact.
info = client.describe_training_job(TrainingJobName=job_name)
model_data = info['ModelArtifacts']['S3ModelArtifacts']
print(model_data)

# Serve the artifact with the same XGBoost image that trained it.
primary_container = {'Image': container, 'ModelDataUrl': model_data}

create_model_response = client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)

print(create_model_response['ModelArn'])

tuning-olsen-ds001-01-17-45-46
https://s3-us-east-1.amazonaws.com/bryan-predictive-maintenance/output001/single-xgboost/tuning-olsen-ds001-01-17-45-46-023-384e6b07/output/model.tar.gz
arn:aws:sagemaker:us-east-1:023375022819:model/tuning-olsen-ds001-01-17-45-46
CPU times: user 14.4 ms, sys: 5.33 ms, total: 19.7 ms
Wall time: 361 ms


#### Create endpoint configuration

In [43]:
from time import gmtime, strftime

# Fixed config name; a timestamp suffix could be appended to make it unique:
# + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_config_name = 'XGboostEndpointConfig-maintenance'
print(endpoint_config_name)

# A single variant that receives all traffic on one ml.m4.xlarge instance.
create_endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            'VariantName': 'AllTraffic',
            'ModelName': model_name,
            'InstanceType': 'ml.m4.xlarge',
            'InitialInstanceCount': 1,
            'InitialVariantWeight': 1,
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

XGboostEndpointConfig-maintenance
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:023375022819:endpoint-config/xgboostendpointconfig-maintenance


#### Create endpoint 

In [69]:
%%time
import time

endpoint_name = 'XGEndpoint-maintenance'
print(endpoint_name)

create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(create_endpoint_response['EndpointArn'])

# Poll once immediately, then once a minute for as long as the endpoint
# still reports 'Creating'; any other status ends the loop.
while True:
    resp = client.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)
    if status != 'Creating':
        break
    time.sleep(60)

# Final state of the endpoint.
print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

CPU times: user 13 µs, sys: 2 µs, total: 15 µs
Wall time: 11 µs


#### Run some validation on the test dataset

In [None]:
# Runtime client used to send inference requests to the hosted endpoint.
runtime_client = boto3.client(service_name='runtime.sagemaker')

In [80]:
# %%time
import json
from itertools import islice
import math
import struct

# Load the uploaded test split. A dedicated name is used for the prefix so the
# global `prefix` that write_to_csv reads is not clobbered by this cell.
bucket = 'bryan-predictive-maintenance'
test_prefix = 'output{}/test/'.format(dataset)
test_df = load_from_s3('test', bucket, test_prefix).fillna(0)

# Column 0 is treated as the label; everything after it is sent as features.
actual = test_df[0].tolist()

# Build the CSV payload in memory. The original wrote it to a hard-coded
# 'engine_data/test_val_001.csv' (ignoring `dataset`, and failing if the
# directory did not exist) only to read it straight back.
payload = test_df.iloc[:, 1:].to_csv(index=False, header=False).strip()

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=payload
)

# The response body is parsed as one comma-separated list of predictions.
result = response['Body'].read().decode("utf-8").split(',')
# Predictions are rounded up to whole numbers before scoring (kept from the original).
result = [math.ceil(float(i)) for i in result]

# Building from a dict raises if the endpoint returned a different number of
# predictions than there are labels; zip() would silently truncate instead.
df = pd.DataFrame({'predicted': result, 'actual': actual},
                  columns=['predicted', 'actual'])

# Root-mean-squared error over the test set. Computed first and then printed:
# the original `print (...).mean() ** .5` only works as a Python 2 statement.
rmse = ((df.predicted - df.actual) ** 2).mean() ** .5
print(rmse)


28.4562822589


#### DELETE ENDPOINT

In [None]:
# Tear down the hosted endpoint created above.
client.delete_endpoint(
    EndpointName=endpoint_name,
)