#### Import some stuff

In [None]:
import pandas as pd
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

#### Initialize session, role, region, and container

In [None]:
# Region of the default boto3 session; it is also what selects the regional
# XGBoost training image below.
region = boto3.Session().region_name
container = get_image_uri(boto3.Session().region_name, 'xgboost')

# SageMaker session and the execution role returned by get_execution_role().
sess = sagemaker.Session()
role = get_execution_role()

#### Define Load Data function

In [None]:
# S3 location of the raw input CSVs. The prefix keeps its trailing slash
# because load_from_s3 joins it to the file name without a separator.
bucket, prefix = 'bryan-predictive-maintenance', 'sagemaker/'

def load_from_s3(fname):
    """Read ``<prefix><fname>.csv`` from the module-level ``bucket`` into a DataFrame.

    The object is fetched with ``get_object`` and its body is handed straight
    to ``pandas.read_csv``. The file is parsed without a header row
    (``header=None``).

    Args:
        fname: Object name without the ``.csv`` extension; the module-level
            ``prefix`` is prepended as-is.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    object_key = "{}{}.csv".format(prefix, fname)
    s3_object = boto3.client('s3').get_object(Bucket=bucket, Key=object_key)
    return pd.read_csv(s3_object["Body"], low_memory=False, header=None)

# Load the four input files in the same order as before:
# train001, test001, train002, test002.
train001, test001, train002, test002 = (
    load_from_s3(name)
    for name in ("train001", "test001", "train002", "test002")
)

#### Split training data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of each training frame for validation; the fixed random_state
# makes both splits repeatable.
_split_options = {"test_size": 0.33, "random_state": 123}
t001, val001 = train_test_split(train001, **_split_options)
t002, val002 = train_test_split(train002, **_split_options)

#### Write to s3

In [None]:
# Destination bucket and key prefix used by write_to_csv for the uploads.
bucket, prefix = 'bryan-predictive-maintenance', 'output001'

def write_to_csv(df, fname, channel):
    """Save ``df`` as a local CSV and upload it to ``<prefix>/<channel>/<fname>``.

    The frame is written to ``fname`` in the working directory without the
    index and without a header row, then uploaded to the module-level
    ``bucket`` under the module-level ``prefix``.

    Args:
        df: DataFrame to export; column order is left unchanged.
        fname: Local file name, also used as the last component of the S3 key.
        channel: Sub-folder under ``prefix`` that the file is uploaded to.
    """
    # Save the file locally first; the upload below streams it from disk.
    df.to_csv(fname, index=False, header=False)

    # Create connection
    s3conn = boto3.client('s3')

    # Write file. Opening in binary mode hands S3 the exact bytes on disk, and
    # the context manager closes the handle even if the upload raises.
    outfile = '{}/{}/{}'.format(prefix, channel, fname)
    with open(fname, 'rb') as body:
        s3conn.put_object(
            Body=body,
            Bucket=bucket,
            Key=outfile
        )

# Upload the 001 splits, one file per channel folder, in the original order.
for _frame, _fname, _channel in (
    (t001, 'train.csv', 'train'),
    (val001, 'validate.csv', 'validation'),
    (test001, 'test.csv', 'test'),
):
    write_to_csv(_frame, _fname, _channel)

#### ~~Stealing~~ Borrowing code

In [None]:
%%time
import boto3
from time import gmtime, strftime

# A UTC timestamp suffix gives each run its own training job name.
job_name = 'bryan-was-here-xgboost-{}'.format(
    strftime("%Y-%m-%d-%H-%M-%S", gmtime())
)
print("Training job", job_name)

# Ensure that the training and validation data folders generated above are
# reflected in the "InputDataConfig" parameter below.
bucket, prefix = 'bryan-predictive-maintenance', 'output001'

region = boto3.Session().region_name
container = get_image_uri(boto3.Session().region_name, 'xgboost')
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

sess = sagemaker.Session()
role = get_execution_role()


# In create_training_params below.. could add the following with HyperParameters as a top level key
#     "HyperParameters": {
#         "max_depth":"5",
#         "eta":"0.2",
#         "gamma":"4",
#         "min_child_weight":"6",
#         "subsample":"0.7",
#         "silent":"0",
#         "objective":"reg:linear",
#         "num_round":"50"
#     },


def _s3_csv_channel(channel_name):
    """Return the InputDataConfig entry for the CSVs under ``<prefix>/<channel_name>``."""
    return {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": bucket_path + "/" + prefix + "/" + channel_name,
                "S3DataDistributionType": "FullyReplicated",
            }
        },
        "ContentType": "csv",
        "CompressionType": "None",
    }


# Job definition shared by the training and tuning cells below. The three
# input channels differ only by name, so they are generated by the helper.
create_training_params = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File",
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path + "/" + prefix + "/single-xgboost",
    },
    "StaticHyperParameters": {
        "eval_metric": "rmse",
        "num_round": "100",
        "objective": "reg:linear",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4",
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5,
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600,
    },
    "InputDataConfig": [
        _s3_csv_channel(name) for name in ("train", "validation", "test")
    ],
}

#### Launch model training

In [None]:
import time

client = boto3.client('sagemaker')

# create_training_params is shaped like a tuning job's TrainingJobDefinition:
# it carries no "TrainingJobName" and keeps its hyperparameters under
# "StaticHyperParameters". create_training_job requires the name and only
# accepts "HyperParameters", so adapt a shallow copy and leave the original
# dict untouched for the tuning job.
training_job_request = dict(create_training_params)
static_hyperparameters = training_job_request.pop("StaticHyperParameters", None)
if static_hyperparameters is not None:
    training_job_request.setdefault("HyperParameters", static_hyperparameters)
training_job_request.setdefault("TrainingJobName", job_name)
client.create_training_job(**training_job_request)

# Poll once a minute until the job reaches a terminal state. "Stopped" is
# terminal as well; without it a manually stopped job would loop forever.
terminal_statuses = ('Completed', 'Failed', 'Stopped')
status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status not in terminal_statuses:
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

#### "Borrowed" hyperparameter tuning code

In [None]:
from time import gmtime, strftime, sleep
# Day-and-time suffix (UTC) to tell tuning runs apart.
tuning_job_name = 'xgboost-tuningjob-{}'.format(strftime("%d-%H-%M-%S", gmtime()))

print(tuning_job_name)

# Search space, resource limits and objective for the tuning job.
# All range bounds are passed as strings.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": high, "MinValue": low, "Name": name}
            for name, low, high in (
                ("eta", "0", "1"),
                ("min_child_weight", "1", "10"),
                ("alpha", "0", "2"),
            )
        ],
        "IntegerParameterRanges": [
            {"MaxValue": "10", "MinValue": "1", "Name": "max_depth"},
        ],
    },
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 50,
        "MaxParallelTrainingJobs": 5,
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:rmse",
        "Type": "Minimize",
    },
}

In [None]:
# Submit the tuning job. The params dict built for the training job is reused
# as the TrainingJobDefinition for every trial.
_tuning_request = {
    "HyperParameterTuningJobName": tuning_job_name,
    "HyperParameterTuningJobConfig": tuning_job_config,
    "TrainingJobDefinition": create_training_params,
}
client.create_hyper_parameter_tuning_job(**_tuning_request)

In [None]:
# Current status of the tuning job. `client` is the SageMaker client created
# when the training job was launched; no `smclient` exists in this notebook.
client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name)['HyperParameterTuningJobStatus']